Is it possible to use Google RE2 with streams? Some input literals that we are suppose to process with regular expressions can potentially be too large to hold in-memory.

没有正确的解决方案

其他提示

If there is a maximum match length, you could read the data in blocks of at least twice that length. If the match fails, or starts less than that many characters from the end, cut the current string, and append another block.

The length of the match string would never be more than the block length + max match length.

Example in C#:

public static IEnumerable<StreamMatch> MatchesInStream(
        this Regex pattern, TextReader reader,
        int maxMatchLength, int blockLength)
{
    if (maxMatchLength <= 0)
    {
        throw new ArgumentException("Must be positive", "maxMatchLength");
    }
    if (blockLength < maxMatchLength)
    {
        throw new ArgumentException("Must be at least as long as maxMatchLength", "blockLength");
    }

    char[] buffer = new char[blockLength];
    string chunk = "";
    int matchOffset = 0;

    // Read one block, and append to the string
    int charsRead = reader.ReadBlock(buffer, 0, blockLength);
    chunk += new string(buffer, 0, charsRead);

    while (charsRead > 0 && chunk.Length > maxMatchLength)
    {
        int cutPosition = 0;
        foreach (Match match in pattern.Matches(chunk))
        {
            if (match.Index > chunk.Length - maxMatchLength)
            {
                // The match could possibly have matched more characters.
                // Read another block before trying again.
                break;
            }

            yield return new StreamMatch(matchOffset, match);
            cutPosition = match.Index + match.Length;
        }
        cutPosition = Math.Max(cutPosition, chunk.Length - maxMatchLength);
        matchOffset += cutPosition;
        chunk = chunk.Substring(cutPosition);

        charsRead = reader.ReadBlock(buffer, 0, blockLength);
        chunk += new string(buffer, 0, charsRead);
    }

    // Stream has ended. Try to match the last remaining characters.
    foreach (Match match in pattern.Matches(chunk))
    {
        yield return new StreamMatch(matchOffset, match);
    }
}

public class StreamMatch
{
    public int MatchOffset { get; private set; }
    public Match Match { get; private set; }

    public StreamMatch(int matchOffset, Match match)
    {
        MatchOffset = matchOffset;
        Match = match;
    }
}
// One horrible XML parser
var reader = new StreamReader(stream);
var pattern = new Regex(@"<(/?)([\w:-]{1,15})([^<>]{0,50}(?<!/))(/?)>");
foreach (StreamMatch match in pattern.MatchesInStream(reader, 69, 128))
{
    Console.WriteLine(match.Match.Value);
}
许可以下: CC-BY-SA归因
不隶属于 StackOverflow
scroll top