Frage

The question "Split a string by commas but ignore commas within double-quotes using Javascript" gives the regexp /(".*?"|[^",\s]+)(?=\s*,|\s*$)/. How can I use it with Erlang's re:split()? The regexp doesn't work as-is in Erlang.

1> S = "20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0".
2> re:split(S, "(\".*?\"|[^\",\s]+,)(?=\s*,|\s*$)", [{return,list}]).
["20140419,","\"Blah blah, foo foo\"",",1,0,0,0,1,2,0,0"]

The result I'm looking for is the list

["20140419","\"Blah blah, foo foo\"","1","0","0","0","1","2","0","0"]

Thanks.

War es hilfreich?

Lösung

Just translate the JavaScript regular expression to Erlang:

Erlang R16B03-1 (erts-5.10.4) [source] [64-bit] [smp:8:8] [async-threads:10] [kernel-poll:false]

Eshell V5.10.4  (abort with ^G)
1> S = "20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0".
"20140419,\"Blah blah, foo foo\",1,0,0,0,1,2,0,0"
2> {ok,R} = re:compile("(\".*?\"|[^\",\\s]+)(?=\\s*,|\\s*$)").
{ok,{re_pattern,1,0,
                <<69,82,67,80,122,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,
                  ...>>}}
3> {match,Matches} = re:run(S, R, [{capture,[1],list},global]).
{match,[["20140419"],
        ["\"Blah blah, foo foo\""],
        ["1"],
        ["0"],
        ["0"],
        ["0"],
        ["1"],
        ["2"],
        ["0"],
        ["0"]]}
4> [M || [M] <- Matches].
["20140419","\"Blah blah, foo foo\"","1","0","0","0","1",
 "2","0","0"]

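In shell command 2, note the double backslashes in the pattern: inside an Erlang string literal, \s is itself an escape sequence (it denotes a space character), so the regex class \s must be written as \\s for the backslash to reach the re module.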

Andere Tipps

Came up with a parser using pattern matching. Adding it here in case anyone finds it useful.

%% @doc Splits one CSV line into its fields.
%%
%% Fields are separated by commas; a comma inside a double-quoted section
%% does not split. The quote characters are kept in the returned fields,
%% and empty fields are preserved, so "a,,b" yields ["a", "", "b"] and
%% "" yields [""].
%%
%% String is a flat character list. The result is a list of fields in
%% input order. An unterminated quote simply extends the last field to the
%% end of the input.
parse_csv(String) -> parse_csv(String, [], [], false).

%% Fields   - completed fields, most recent first.
%% Field    - characters of the field being read, in reverse order.
%% InQuotes - true while between an opening and a closing double quote.
parse_csv([], Fields, Field, _InQuotes) ->
    % End of input: whatever was collected is the last field.
    lists:reverse([lists:reverse(Field) | Fields]);
parse_csv([$" | T], Fields, Field, InQuotes) ->
    % A quote opens or closes a quoted section; it stays part of the field,
    % so text before or after the quoted section is never dropped.
    parse_csv(T, Fields, [$" | Field], not InQuotes);
parse_csv([$, | T], Fields, Field, false) ->
    % A comma outside quotes terminates the current field.
    parse_csv(T, [lists:reverse(Field) | Fields], [], false);
parse_csv([C | T], Fields, Field, InQuotes) ->
    % Any other character, including a comma inside quotes, is field content.
    parse_csv(T, Fields, [C | Field], InQuotes).

Example:

2> ql:parse_csv("foo,\"bar aa\",blah,\"dooo\",phew").                          
["foo","\"bar aa\"","blah","\"dooo\"","phew"]
3> ql:parse_csv("foo,bar,baz").
["foo","bar","baz"]
4> ql:parse_csv("foo,\"foo, bar\",baz").
["foo","\"foo, bar\"","baz"]

How about this:

1> string:tokens(S, ",").
["20140419","\"Blah blah"," foo foo\"","1","0","0","0","1","2","0","0"]

Or even:

2> re:split(S, ",", [{return,list}]).
["20140419","\"Blah blah"," foo foo\"","1","0","0","0","1","2","0","0"]

string:tokens/2's doc.

(@kadaj, that is seriously a lot of code to parse CSVs)

Edit: to properly answer the question, one needs to reassemble "\"…", "…\"" pairs. To do so, a trivial recursive function will do:

%% @doc Reassembles quoted fields that a plain split on "," has broken apart.
%%
%% Tokens is the output of e.g. string:tokens(S, ",") or re:split(S, ",", ...);
%% Acc holds the fields finished so far in reverse order (pass [] initially).
%% A token is merged with the tokens that follow it for as long as the
%% merged text contains an odd number of double quotes, i.e. a quoted
%% section is still open. The comma removed by the split is put back
%% between merged tokens. Returns the fields in input order.
finish([Token | T], Acc) ->
    {Field, Rest} = close_quotes(Token, T),
    finish(Rest, [Field | Acc]);
finish([], Acc) ->
    lists:reverse(Acc).

%% Extends Field with following tokens until its quotes are balanced or the
%% tokens run out; returns the field and the tokens that were not consumed.
close_quotes(Field, [Next | T] = Rest) ->
    case has_open_quote(Field) of
        true  -> close_quotes(Field ++ "," ++ Next, T);
        false -> {Field, Rest}
    end;
close_quotes(Field, []) ->
    % Nothing left to merge; an unterminated quote is returned as-is.
    {Field, []}.

%% True when Field contains an odd number of double-quote characters.
has_open_quote(Field) ->
    length([C || C <- Field, C =:= $\"]) rem 2 =:= 1.
Lizenziert unter: CC-BY-SA mit Zuschreibung
Nicht verbunden mit StackOverflow
scroll top