Question

NEST doesn't appear to support the pattern replace char filter described here:

http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-pattern-replace-charfilter.html

I've created an issue at https://github.com/elasticsearch/elasticsearch-net/issues/543.

Most of my indexing is working, so I would like to continue using NEST. Is there a way I can work around this with some manual JSON injection at some point during index configuration? I'm new to NEST, so I'm not sure whether this is doable.

Specifically, I'm hoping to use the pattern replace char filter to remove unit numbers from a street address before it is run through a custom analyzer (e.g. "#205 - 1260 Broadway" becomes "1260 Broadway"). Because of the custom analyzer, I believe a char filter is what I need to accomplish this.

My current configuration looks like this:

  elasticClient.CreateIndex("geocoding", c => c
            .Analysis(ad => ad
                .Analyzers(ab => ab
                    .Add("address-index", new CustomAnalyzer()
                    {
                        Tokenizer = "whitespace",
                        Filter = new List<string>() { "lowercase", "synonym" }
                    })
                    .Add("address-search", new CustomAnalyzer()
                    {
                        Tokenizer = "whitespace",
                        Filter = new List<string>() { "lowercase" },
                        CharFilter = new List<string>() { "drop-unit" }
                    })
                )
                .CharFilters(cfb => cfb
                    .Add("drop-unit", new CharFilter()) //missing char filter here
                )
                .TokenFilters(tfb => tfb
                    .Add("synonym", new SynonymTokenFilter()
                    {
                        Expand = true,
                        SynonymsPath = "analysis/synonym.txt"
                    })
                )
             )
        );

UPDATE:

As of May 2014, NEST now supports the pattern replace char filter: https://github.com/elasticsearch/elasticsearch-net/pull/637
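With that support in place, the drop-unit filter from the question can presumably be declared fluently. Here is an untested sketch, based on the PatternReplaceCharFilter usage shown in the answers below (the pattern is the one from the accepted follow-up):

    elasticClient.CreateIndex("geocoding", c => c
        .Analysis(ad => ad
            .CharFilters(cfb => cfb
                .Add("drop-unit", new PatternReplaceCharFilter
                {
                    // strips a leading unit number, e.g. "#205 - 1260 Broadway" -> "1260 Broadway"
                    Pattern = @"#\d+\s-\s",
                    Replacement = ""
                })
            )
        )
    );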


Solution

Instead of using the fluent settings during index creation, you can use the Settings.Add approach to add to the FluentDictionary in a more manual way, giving you complete control over the settings that are passed in. An example of this is shown in the Create Index section of the NEST documentation. I am using this approach for a very similar reason.

Your configuration would look something like the following:

 elasticClient.CreateIndex("geocoding", c => c
       .Settings(s => s
           .Add("analysis.analyzer.address-index.type", "custom")
           .Add("analysis.analyzer.address-index.tokenizer", "whitespace")
           .Add("analysis.analyzer.address-index.filter.0", "lowercase")
           .Add("analysis.analyzer.address-index.filter.1", "synonym")
           .Add("anaylsis.analyzer.address-search.type", "custom")
           .Add("analysis.analyzer.address-search.tokenizer", "whitespace")
           .Add("analysis.analyzer.address-search.filter.0", "lowercase")
           .Add("analysis.analyzer.address-search.char_filter.0", "drop-unit")
           .Add("analysis.char_filter.drop-unit.type", "mapping")
           .Add("analysis.char_filter.drop-unit.mappings.0", "<mapping1>")
           .Add("analysis.char_filter.drop-unit.mappings.1", "<mapping2>")
           ...
       )
  );

You will need to replace <mapping1> and <mapping2> above with the actual char_filter mappings you want to use. Please note that I have not used a char_filter before, so the setting values may be a little off, but they should get you going in the right direction.
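For reference, Elasticsearch's mapping char filter expects each entry in the form `from=>to`, so the two placeholder settings would hold literal replacements like these (illustrative values only):

    .Add("analysis.char_filter.drop-unit.mappings.0", "#=>")   // delete '#' characters
    .Add("analysis.char_filter.drop-unit.mappings.1", "-=>")   // delete '-' characters

Note that a mapping filter can only do literal replacements; it cannot match the digits of an arbitrary unit number, which is why the follow-up below switches to pattern_replace.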

OTHER TIPS

Just to provide a follow-up to Paige's very helpful answer, it looks like you can combine the fluent and manual Settings.Add approaches. The following worked for me:

     elasticClient.CreateIndex("geocoding", c => c
            .Settings(s => s
                .Add("analysis.char_filter.drop_unit.type", "pattern_replace")
                .Add("analysis.char_filter.drop_unit.pattern", @"#\d+\s-\s")
                .Add("analysis.char_filter.drop_unit.replacement", "")
            )
            .Analysis(ad => ad
                .Analyzers(ab => ab
                    .Add("address_index", new CustomAnalyzer()
                    {
                        Tokenizer = "whitespace",
                        Filter = new List<string>() { "lowercase", "synonym" }
                    })
                    .Add("address_search", new CustomAnalyzer()
                    {
                        CharFilter = new List<string> { "drop_unit" },
                        Tokenizer = "whitespace",
                        Filter = new List<string>() { "lowercase" }
                    })
                )
                .TokenFilters(tfb => tfb
                    .Add("synonym", new SynonymTokenFilter()
                    {
                        Expand = true,
                        SynonymsPath = "analysis/synonym.txt"
                    })
                )
             )
        );

    EsClient.CreateIndex("universal_de", c => c
        .NumberOfReplicas(1)
        .NumberOfShards(5)
        .Settings(s => s // just as an example
            .Add("merge.policy.merge_factor", "10")
            .Add("search.slowlog.threshold.fetch.warn", "1s")
            .Add("analysis.char_filter.drop_chars.type", "pattern_replace")
            .Add("analysis.char_filter.drop_chars.pattern", @"[^0-9]")
            .Add("analysis.char_filter.drop_chars.replacement", "")
            .Add("analysis.char_filter.drop_specChars.type", "pattern_replace")
            .Add("analysis.char_filter.drop_specChars.pattern", @"[^0-9a-zA-Z]")
            .Add("analysis.char_filter.drop_specChars.replacement", "")
        )
        .Analysis(descriptor => descriptor
            .Analyzers(bases => bases
                .Add("folded_word", new CustomAnalyzer()
                {
                    Filter = new List<string> { "lowercase", "asciifolding", "trim" },
                    Tokenizer = "standard"
                })
                .Add("trimmed_number", new CustomAnalyzer()
                {
                    CharFilter = new List<string> { "drop_chars" },
                    Tokenizer = "standard",
                    Filter = new List<string>() { "lowercase" }
                })
                .Add("trimmed_specChars", new CustomAnalyzer()
                {
                    CharFilter = new List<string> { "drop_specChars" },
                    Tokenizer = "standard",
                    Filter = new List<string>() { "lowercase" }
                })
            )
        )
        .AddMapping<Business>(m => m
            //.MapFromAttributes()
            .Properties(props => props
                .MultiField(mf => mf
                    .Name(t => t.DirectoryName)
                    .Fields(fs => fs
                        .String(s => s.Name(t => t.DirectoryName).Analyzer("standard"))
                        .String(s => s.Name(t => t.DirectoryName.Suffix("folded")).Analyzer("folded_word"))
                    )
                )
                .MultiField(mf => mf
                    .Name(t => t.Phone)
                    .Fields(fs => fs
                        .String(s => s.Name(t => t.Phone).Analyzer("trimmed_number"))
                    )
                )
            )
        );

This is how you create the index and add the mapping. Now, to search, I have something like this:

    var result = _Instance.Search<Business>(q => q
        .TrackScores(true)
        .Query(qq =>
        {
            QueryContainer termQuery = null;
            if (!string.IsNullOrWhiteSpace(input.searchTerm))
            {
                var toLowSearchTerm = input.searchTerm.ToLower();
                termQuery |= qq.QueryString(qs => qs
                    .OnFieldsWithBoost(f => f
                        .Add("directoryName.folded", 5.0)
                    )
                    .Query(toLowSearchTerm));
                termQuery |= qq.Fuzzy(fz => fz.OnField("directoryName.folded").Value(toLowSearchTerm).MaxExpansions(2));
                // strip non-digits so the term matches what the trimmed_number analyzer indexed
                // (Regex requires using System.Text.RegularExpressions)
                termQuery |= qq.Term("phone", Regex.Replace(toLowSearchTerm, @"[^0-9]", ""));
            }

            return termQuery;
        })
        .Skip(input.skip)
        .Take(input.take)
    );

NEW: I managed to use the pattern replace char filter in a better way, like this:

    .Analysis(descriptor => descriptor
        .Analyzers(bases => bases
            .Add("folded_word", new CustomAnalyzer()
            {
                Filter = new List<string> { "lowercase", "asciifolding", "trim" },
                Tokenizer = "standard"
            })
            .Add("trimmed_number", new CustomAnalyzer()
            {
                CharFilter = new List<string> { "drop_chars" },
                Tokenizer = "standard",
                Filter = new List<string>() { "lowercase" }
            })
            .Add("trimmed_specChars", new CustomAnalyzer()
            {
                CharFilter = new List<string> { "drop_specChars" },
                Tokenizer = "standard",
                Filter = new List<string>() { "lowercase" }
            })
            .Add("autocomplete", new CustomAnalyzer()
            {
                Tokenizer = new WhitespaceTokenizer().Type,
                Filter = new List<string>() { "lowercase", "asciifolding", "trim", "engram" }
            })
        )
        .TokenFilters(i => i
            .Add("engram", new EdgeNGramTokenFilter
            {
                MinGram = 3,
                MaxGram = 15
            })
        )
        .CharFilters(cf => cf
            .Add("drop_chars", new PatternReplaceCharFilter
            {
                Pattern = @"[^0-9]",
                Replacement = ""
            })
            .Add("drop_specChars", new PatternReplaceCharFilter
            {
                Pattern = @"[^0-9a-zA-Z]",
                Replacement = ""
            })
        )
    )
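The analyzers above only take effect once a field references them. As a rough sketch (reusing the Business mapping from earlier; the "autocomplete" subfield name is hypothetical), the edge n-gram analyzer would typically be applied at index time only, with a plain analyzer at search time so query terms are not n-grammed:

    .AddMapping<Business>(m => m
        .Properties(props => props
            .MultiField(mf => mf
                .Name(t => t.DirectoryName)
                .Fields(fs => fs
                    .String(s => s.Name(t => t.DirectoryName).Analyzer("standard"))
                    .String(s => s
                        .Name(t => t.DirectoryName.Suffix("autocomplete")) // hypothetical subfield
                        .IndexAnalyzer("autocomplete")    // 3-15 char edge n-grams at index time
                        .SearchAnalyzer("folded_word"))   // search terms stay un-n-grammed
                )
            )
        )
    )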
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow