Question

delimiter is |

escaping character is \

and string is for example "A|B\|C\\|D\\\|E|\\\\F"

I want to get array: {"A", "B|C\", "D\|E", "\\F"}

So the delimiter can be escaped, but the escaping character itself can also be escaped. Does anybody know how to parse this in Java?

Thanks.

Edit: I came up with the rather ugly solution below. At least it works, and it makes it easy to choose the delimiter and whether empty strings should be removed (the escaping character is set in the first two lines of the method).

SOLUTION (Eggyal posted a better one; see below):

/**
 * Splits {@code string} at every delimiter that is not escaped by a backslash, then removes
 * one level of backslash escaping from each resulting piece.
 *
 * <p>The scan compares single characters of the input with {@code delimiter}, so a delimiter
 * is only recognised there when it is exactly one character long.
 *
 * @param string      the text to split
 * @param delimiter   the separator, given as a string
 * @param removeEmpty when {@code true}, empty pieces are left out of the result
 * @return the pieces in order of appearance, with each backslash-plus-character pair replaced
 *         by the character alone
 */
private List<String> parseString(String string, String delimiter, boolean removeEmpty) {
    // The escape character as a plain string and as a regular-expression fragment.
    String escapingChar = "\\";
    String escapingCharInRegexp = "\\\\";
    // True while the scan is inside a run of consecutive escape/delimiter characters.
    boolean begined = false;
    List<String> parsed = new ArrayList<String>();
    // Parallel lists: run i covers the indices begins.get(i) (inclusive) to ends.get(i) (exclusive).
    List<Integer> begins = new ArrayList<Integer>();
    List<Integer> ends = new ArrayList<Integer>();
    // Indices of the delimiters that really separate two pieces.
    List<Integer> delimitersPositions = new ArrayList<Integer>();
    // Raw pieces, still containing their escape characters.
    List<String> explodedParts = new ArrayList<String>();
    int i;
    // Phase 1: record every run of escape/delimiter characters. A run is closed right after the
    // first delimiter it contains, so a delimiter is always the last character of its run.
    for(i = 0; i < string.length(); i++) {
        if( ( string.substring(i, i+1).equals(escapingChar) || string.substring(i, i+1).equals(delimiter) ) && !begined ) {
            // First escape/delimiter character after ordinary text: open a new run.
            begins.add(i);
            begined = true;
            if( i + 1 == string.length() ) {
                // The run starts on the last character, so close it straight away.
                begined = false;
                ends.add(i+1);
            }
        } else if( ( !string.substring(i, i+1).equals(escapingChar) && !string.substring(i, i+1).equals(delimiter) && begined ) ) {
            // An ordinary character ends the current run.
            begined = false;
            ends.add(i);
        } else if( begined && string.substring(begins.get(begins.size()-1), i).indexOf(delimiter) != -1 ) {
            // Still inside a run, but it already holds a delimiter: close it and open a new run here.
            begined = false;
            ends.add(i);
            begined = true;
            begins.add(i);
        } 
        if( ( i + 1 == string.length() && begined ) ) {
            // End of input reached while a run is open: close it.
            begined = false;
            ends.add(i+1);
        }
    }
    // Phase 2: drop the runs that contain no delimiter (escape characters only).
    List<Integer> toRemove = new ArrayList<Integer>();
    for( i = 0; i < begins.size(); i++ ) {
        if( string.substring(begins.get(i), ends.get(i)).indexOf(delimiter) == -1 ) {
            toRemove.add(i);
        }
    }
    for( i = 0; i < toRemove.size(); i++ ) {
        // Every earlier removal shifted the later entries down by one, hence the "- i". The
        // subtraction yields an int, so remove(int index) is called rather than remove(Object).
        begins.remove(toRemove.get(i)-i);
        ends.remove(toRemove.get(i)-i);
    }       
    // Phase 3: a run of odd length is an even number of escape characters followed by the
    // delimiter, so that delimiter is not escaped and marks a split position.
    for( i = 0; i < begins.size(); i++ ) {
        if( ( ends.get(i) - begins.get(i) ) % 2 != 0 ) {
            delimitersPositions.add(ends.get(i)-1);
        }
    }       
    // Phase 4: cut the input between the split positions; there is one more piece than delimiters.
    for( i = 0; i <= delimitersPositions.size(); i++ ) {
        int start = (i == 0) ? 0 : delimitersPositions.get(i-1)+1;
        int end = ( i != delimitersPositions.size()) ? delimitersPositions.get(i) : string.length();
        if( removeEmpty ) {
            if( !string.substring(start, end).equals("") ) {
                explodedParts.add(string.substring(start, end));
            }
        } else {
            explodedParts.add(string.substring(start, end));

        }
    }
    // Phase 5: unescape by replacing each backslash-plus-character pair with the character.
    for (i = 0; i < explodedParts.size(); i++)
        parsed.add(explodedParts.get(i).replaceAll(escapingCharInRegexp+"(.)", "$1"));

    return parsed;
}
Was it helpful?

Solution

/** Character that makes the character following it count as literal text. */
static final char ESCAPING_CHAR = '\\';

/**
 * Splits {@code str} into parts at each unescaped {@code delimiter}, unescaping as it goes.
 *
 * <p>An {@link #ESCAPING_CHAR} is dropped and the character after it is copied into the
 * current part verbatim, even if it is the delimiter or another escaping character. An
 * escaping character with nothing after it is discarded.
 *
 * @param str         the text to split
 * @param delimiter   the character that separates parts
 * @param removeEmpty when {@code true}, empty parts are not added to the result
 * @return the parts in the order they occur in {@code str}
 * @throws IOException kept in the signature for existing callers
 */
private List<String> parseString(final String  str,
                                 final char    delimiter,
                                 final boolean removeEmpty)
  throws IOException
{
  final List<String>  pieces  = new ArrayList<String>();
  final StringBuilder current = new StringBuilder();
  final int           length  = str.length();

  int position = 0;
  while (true) {
    // -1 stands for "no more input", any other value is the character just consumed
    int symbol = (position < length) ? str.charAt(position++) : -1;

    if (symbol >= 0 && symbol != delimiter) {
      if (symbol == ESCAPING_CHAR) {
        // the escape itself is dropped; whatever follows it is taken literally
        symbol = (position < length) ? str.charAt(position++) : -1;
      }
      if (symbol >= 0) {
        current.append((char) symbol);
        continue;
      }
    }

    // reached an unescaped delimiter or the end of the input: the current part is finished
    final boolean skipPart = removeEmpty && current.length() == 0;
    if (!skipPart) {
      pieces.add(current.toString());
      current.setLength(0);
    }

    if (symbol < 0) {
      break;
    }
  }

  return pieces;
}

OTHER TIPS

Because you are both splitting and unescaping, you need a separate step for each process:

// Step 1 - split. A '|' is a real delimiter only when the run of backslashes directly in front
// of it has even length (zero included); an odd run means the '|' itself is escaped. The inner
// (?<!\\) anchors the run at its start, so it also works at the beginning of the input.
// Java look-behind must have a bounded length, hence at most 16 escaped backslashes
// (32 characters) in front of a delimiter are recognised.
// NOTE(review): `input` is assumed to be a String declared by the surrounding code.
String[] terms = input.split("(?<=(?<!\\\\)(?:\\\\\\\\){0,16})\\|");
// Step 2 - unescape. Replace every backslash-plus-character pair with the character alone.
for (int i = 0; i < terms.length; i++) {
    terms[i] = terms[i].replaceAll("\\\\(.)", "$1");
}

Here's some test code:

/**
 * Demonstrates the two-step approach on the string from the question: first split on the
 * unescaped delimiters, then strip one level of backslash escaping from every term.
 * Prints the raw input followed by the resulting terms.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String input = "A|B\\|C\\\\|D\\\\\\|E|\\\\\\\\F";
    // A '|' separates terms only when an even number of backslashes (zero included) stands
    // directly in front of it. The inner (?<!\\) anchors that run at its start, so delimiters
    // after four or more backslashes, or at the very beginning of the input, are found too.
    // Java look-behind must be bounded, so runs of up to 32 backslashes are supported.
    String[] terms = input.split("(?<=(?<!\\\\)(?:\\\\\\\\){0,16})\\|");
    // Unescape: every backslash-plus-character pair becomes the character alone.
    for (int i = 0; i < terms.length; i++) {
        terms[i] = terms[i].replaceAll("\\\\(.)", "$1");
    }
    System.out.println(input);
    System.out.println(Arrays.toString(terms));
}

Output:

A|B\|C\\|D\\\|E|\\\\F
[A, B|C\, D\|E, \\F]

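Note that Java has no escape sequence "\|": writing it directly inside a string literal causes a compile-time error. In Java source code the backslash itself has to be escaped, i.e. written as "\\|".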

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top