Domanda

I am running .NET 4.5 and Lucene.Net 3.0.3 and am trying to "fix" how the ASCIIFoldingFilter handles umlauts: ä gets converted to a instead of ae, and searches for e.g. Geschäft and Geschaeft should work alike.

I have already implemented my own analyzer:

/// <summary>
/// Keyword analyzer whose output is additionally lower-cased and ASCII-folded.
/// </summary>
public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
    /// <summary>
    /// Builds the token stream for a field by decorating the base analyzer's
    /// stream with a <c>LowerCaseFilter</c> followed by an <c>ASCIIFoldingFilter</c>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; forwarded to the base analyzer.</param>
    /// <param name="reader">Source of the text to analyze; forwarded to the base analyzer.</param>
    /// <returns>The outermost filter of the chain (the ASCII folding filter).</returns>
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        // Start from the base stream and wrap it one filter at a time;
        // the order (lower-case first, then fold) matters for the chain.
        Lucene.Net.Analysis.TokenStream stream = base.TokenStream(fieldName, reader);
        stream = new Lucene.Net.Analysis.LowerCaseFilter(stream);
        stream = new Lucene.Net.Analysis.ASCIIFoldingFilter(stream);

        return stream;
    }
}

Now I have tried to add a Lucene.Net.Analysis.MappingCharFilter-instance like:

// Attempted variant of the analyzer that tries to add a MappingCharFilter.
// NOTE: this snippet is intentionally incomplete and does not compile; the
// constructor arguments on the mappingCharFilter line are placeholders.
public sealed class LowerCaseKeywordAnalyzer : Lucene.Net.Analysis.KeywordAnalyzer
{
    // Builds the chain: base token stream -> LowerCaseFilter -> ASCIIFoldingFilter.
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var keywordTokenizer = base.TokenStream(fieldName, reader);
        var lowerCaseFilter = new Lucene.Net.Analysis.LowerCaseFilter(keywordTokenizer);
        // Open question: which map and which CharStream/TextReader to pass here.
        // Only TokenStream instances are at hand at this point, and the resulting
        // mappingCharFilter is not wired into the chain returned below.
        var mappingCharFilter = new Lucene.Net.Analysis.MappingCharFilter(/* get map from somewhere*/, ???);
        var asciiFoldingFilter = new Lucene.Net.Analysis.ASCIIFoldingFilter(lowerCaseFilter);

        return asciiFoldingFilter;
    }
}

But how do I inject either a CharStream- or TextReader-instance? I just have a Lucene.Net.Analysis.TokenStream-instance (either Lucene.Net.Analysis.LowerCaseFilter or base.TokenStream()) ...
Is there any chance to get this to work, other than writing a custom Lucene.Net.Analysis.TokenFilter that does the job?

È stato utile?

Soluzione

I have implemented my own Lucene.Net.Analysis.TokenFilter:

/// <summary>
/// Token filter that rewrites the German umlauts Ä, Ö, Ü, ä, ö, ü in each term
/// to their two-letter forms (AE, OE, UE, ae, oe, ue). All other characters
/// are copied through unchanged.
/// </summary>
public sealed class UmlautsFoldingFilter : Lucene.Net.Analysis.TokenFilter
{
    private readonly Lucene.Net.Analysis.Tokenattributes.ITermAttribute _termAttribute;

    // Scratch buffer holding the rewritten term; reused across tokens and
    // replaced by a larger array when a term needs more room.
    private char[] _output = new char[512];

    // Number of valid characters currently in _output.
    private int _outputPosition;

    /// <summary>
    /// Creates the filter on top of <paramref name="input"/> and registers the
    /// term attribute used to read and replace each term's text.
    /// </summary>
    /// <param name="input">The upstream token stream to filter.</param>
    public UmlautsFoldingFilter(Lucene.Net.Analysis.TokenStream input)
        : base(input)
    {
        _termAttribute = AddAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and replaces that token's term
    /// text with its umlaut-folded form.
    /// </summary>
    /// <returns><c>false</c> when the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
    public override bool IncrementToken()
    {
        if (input.IncrementToken())
        {
            FoldUmlaut(_termAttribute.TermBuffer(), _termAttribute.TermLength());
            _termAttribute.SetTermBuffer(_output, 0, _outputPosition);

            return true;
        }

        return false;
    }

    /// <summary>
    /// Writes the folded form of the first <paramref name="termLength"/> characters
    /// of <paramref name="termBuffer"/> into <c>_output</c> and leaves the number of
    /// characters written in <c>_outputPosition</c>.
    /// </summary>
    private void FoldUmlaut(char[] termBuffer, int termLength)
    {
        // Reserve four output slots per input character (each character expands
        // to at most two here, so this is a generous upper bound).
        var requiredCapacity = 4 * termLength;
        if (requiredCapacity > _output.Length)
        {
            _output = new char[Lucene.Net.Util.ArrayUtil.GetNextSize(requiredCapacity)];
        }

        _outputPosition = 0;
        for (var i = 0; i < termLength; i++)
        {
            var current = termBuffer[i];
            switch (current)
            {
                case 'Ä':
                    AppendPair('A', 'E');
                    break;
                case 'Ö':
                    AppendPair('O', 'E');
                    break;
                case 'Ü':
                    AppendPair('U', 'E');
                    break;
                case 'ä':
                    AppendPair('a', 'e');
                    break;
                case 'ö':
                    AppendPair('o', 'e');
                    break;
                case 'ü':
                    AppendPair('u', 'e');
                    break;
                default:
                    // Not an umlaut: copy the character as-is.
                    _output[_outputPosition++] = current;
                    break;
            }
        }
    }

    /// <summary>
    /// Appends two characters to <c>_output</c> at the current write position.
    /// </summary>
    private void AppendPair(char first, char second)
    {
        _output[_outputPosition++] = first;
        _output[_outputPosition++] = second;
    }
}

Altri suggerimenti

This is an answer to an old question, but it may still be relevant, as the original answer does not show how to use NormalizeCharMap in Lucene.Net. In your Analyzer class, also override InitReader like this:

/// <summary>
/// Wraps the incoming reader in a <c>MappingCharFilter</c> that replaces German
/// umlauts with their two-letter forms before the text reaches the tokenizer.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used by this override).</param>
/// <param name="reader">The original character source for the field.</param>
/// <returns>A reader that yields the text of <paramref name="reader"/> with the mappings applied.</returns>
// Analyzer.InitReader is declared "protected internal virtual" in Lucene.NET;
// an override outside the Lucene.NET assembly must therefore be "protected"
// (a "public override" is rejected by the compiler with CS0507).
protected override TextReader InitReader(string fieldName, TextReader reader)
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("Ä", "AE");
    builder.Add("ä", "ae");
    builder.Add("Ö", "OE");
    builder.Add("ö", "oe");
    builder.Add("Ü", "UE");
    builder.Add("ü", "ue");
    return new MappingCharFilter(builder.Build(), reader);
}

This should give you the required replacements/normalizations.

Autorizzato sotto: CC-BY-SA insieme a attribuzione
Non affiliato a StackOverflow
scroll top