Question

I have a simple custom analyzer that appears to properly generate phonetic hashes in an index built from SQL Server data. However, most attempts to query indexes generated with my custom analyzer return no results. I haven't been able to find similar cases, so I must certainly be doing something wrong.

Custom filter:

// NOTE(review): this is the broken version from the question. The defect is that
// IncrementToken both passes through the original term AND queues its soundex hash:
// while the input stream has tokens, the term attribute is left holding the ORIGINAL
// term (the hash is only enqueued), so original terms are emitted first and all the
// hashes are appended only after the input is exhausted. Index-time and query-time
// streams therefore only line up for exact matches. The queued branch also never
// calls ClearAttributes(), so position/offset state from the last input token
// presumably leaks into the appended hash tokens — TODO confirm against Lucene docs.
internal class SoundexFilter : TokenFilter
{
    // Term attribute shared with the upstream token stream.
    private readonly ITermAttribute _termAttr;

    // Hashes buffered during pass-through, flushed after the input stream ends.
    private Queue<Token> soundexTokenQueue
        = new Queue<Token>();

    public SoundexFilter(TokenStream input)
        : base(input)
    {
        _termAttr = AddAttribute<ITermAttribute>();
    }

    public override bool IncrementToken()
    {
        if (input.IncrementToken())
        {
            string currentTerm = _termAttr.Term;
            var hash = Soundex.For(currentTerm);
            Console.WriteLine("Original: {0}, Hash: {1}", currentTerm, hash);
            // BUG: the hash is queued for later, but the term attribute still holds
            // the original term, so the original token is what gets emitted here.
            soundexTokenQueue.Enqueue(new Token(hash, 0, hash.Length));
            return true;
        }
        else if (soundexTokenQueue.Count > 0)
        {
            // Input exhausted: drain the queued hashes as extra trailing tokens.
            var token = soundexTokenQueue.Dequeue();

            _termAttr.SetTermBuffer(token.Term);
            _termAttr.SetTermLength(token.TermLength());
            return true;
        }

        return false;
    }
}

Custom analyzer:

public class SoundexAnalyzer : Analyzer
{
    // Builds the analysis chain: standard tokenization, standard normalization,
    // then replacement of every term with its phonetic (soundex) hash.
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new SoundexFilter(
            new StandardFilter(
                new StandardTokenizer(Version.LUCENE_30, reader)));
    }
}

Simple test program:

public class Program
{
    private const string NAME = "John Smith";
    private const string SEARCH_NAME = "John Smith";

    private Analyzer _analyzer = new SoundexAnalyzer();
    private Directory _directory = new RAMDirectory();

    internal void Run(string[] args)
    {
        // Index a single document whose "Name" field runs through the soundex analyzer.
        using (var writer = new IndexWriter(_directory, _analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            var document = new Document();
            document.Add(new Field("Name", NAME, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(document);

            // Unnecessary but helps imply intent
            writer.Commit();
        }

        // Search with the SAME analyzer so the query terms get the same phonetic hashing.
        using (var searcher = new IndexSearcher(_directory))
        {
            var queryParser = new QueryParser(Version.LUCENE_30, "Name", _analyzer);
            var hits = searcher.Search(queryParser.Parse(SEARCH_NAME), 10);

            Console.WriteLine("\nReturned Docs:");

            foreach (var hit in hits.ScoreDocs)
            {
                // Echo the stored field of every matching document.
                Console.WriteLine(searcher.Doc(hit.Doc).Get("Name"));
            }
        }
    }

    private static void Main(string[] args)
    {
        new Program().Run(args);
    }
}

The only search that succeeds using this code is an exact match like NAME = "John" and SEARCH_NAME = "John".

The strange thing is searching in Luke with the standard analyzers for the phonetic hashes works fine, so the write must be working as expected (or at least how I expect).

I've done a fair amount of research around this and have found little help. Any idea what I'm missing?

Was it helpful?

Solution

I figured out what solves the problem but haven't quite figured out exactly why it's a problem.

Basically, my TokenFilter implementation included in the question is attempting to do too much and doesn't appear to align with the expectations of Lucene.

By limiting the IncrementToken implementation to perform just the phonetic hash and replace the ITermAttribute.Term value with the generated hash, it works quite well.

TokenFilter implementation:

public class SoundexFilter : TokenFilter
{
    // Term attribute shared with the upstream stream; its buffer is rewritten in place.
    private readonly ITermAttribute _termAttr;

    public SoundexFilter(TokenStream input)
        : base(input)
    {
        _termAttr = AddAttribute<ITermAttribute>();
    }

    // Replaces each incoming term with its phonetic hash, one token in, one token out.
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        // Any phonetic hash calculation will work here.
        _termAttr.SetTermBuffer(Soundex.For(_termAttr.Term));
        return true;
    }
}

The result requires the same filter to be applied at both index and query time, but it works extremely well.

As a side note, performance of this filter doesn't appear to match my expectations so I'll be profiling the solution to identify possible enhancements. I'd recommend anyone looking to use this solution do the same if they expect sub-second response time for an index with > 2 million documents.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top