Question

Is it possible to use Google RE2 with streams? Some input literals that we are suppose to process with regular expressions can potentially be too large to hold in-memory.

No correct solution

OTHER TIPS

If there is a maximum match length, you could read the data in blocks of at least twice that length. If the match fails, or starts less than that many characters from the end, cut the current string, and append another block.

The length of the match string would never be more than the block length + max match length.

Example in C#:

public static IEnumerable<StreamMatch> MatchesInStream(
        this Regex pattern, TextReader reader,
        int maxMatchLength, int blockLength)
{
    if (maxMatchLength <= 0)
    {
        throw new ArgumentException("Must be positive", "maxMatchLength");
    }
    if (blockLength < maxMatchLength)
    {
        throw new ArgumentException("Must be at least as long as maxMatchLength", "blockLength");
    }

    char[] buffer = new char[blockLength];
    string chunk = "";
    int matchOffset = 0;

    // Read one block, and append to the string
    int charsRead = reader.ReadBlock(buffer, 0, blockLength);
    chunk += new string(buffer, 0, charsRead);

    while (charsRead > 0 && chunk.Length > maxMatchLength)
    {
        int cutPosition = 0;
        foreach (Match match in pattern.Matches(chunk))
        {
            if (match.Index > chunk.Length - maxMatchLength)
            {
                // The match could possibly have matched more characters.
                // Read another block before trying again.
                break;
            }

            yield return new StreamMatch(matchOffset, match);
            cutPosition = match.Index + match.Length;
        }
        cutPosition = Math.Max(cutPosition, chunk.Length - maxMatchLength);
        matchOffset += cutPosition;
        chunk = chunk.Substring(cutPosition);

        charsRead = reader.ReadBlock(buffer, 0, blockLength);
        chunk += new string(buffer, 0, charsRead);
    }

    // Stream has ended. Try to match the last remaining characters.
    foreach (Match match in pattern.Matches(chunk))
    {
        yield return new StreamMatch(matchOffset, match);
    }
}

public class StreamMatch
{
    public int MatchOffset { get; private set; }
    public Match Match { get; private set; }

    public StreamMatch(int matchOffset, Match match)
    {
        MatchOffset = matchOffset;
        Match = match;
    }
}
// One horrible XML parser
var reader = new StreamReader(stream);
var pattern = new Regex(@"<(/?)([\w:-]{1,15})([^<>]{0,50}(?<!/))(/?)>");
foreach (StreamMatch match in pattern.MatchesInStream(reader, 69, 128))
{
    Console.WriteLine(match.Match.Value);
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top