Вопрос

I am trying to implement exact text search within large amount of text. for that I found some examples of Boyer Moore implementation for c# but now I am having trouble understanding how it works.

for example, If I have string this is sample text to search for and want to fin to search for it works, but if I change my search pattern to be to search for and text it still returns value not -1. why is that happening? there is no string with pattern to search for and text in my search text.

below is an implementation which I found through Stackoverflow

public class BoyerMooreStringSearching
{
    readonly Dictionary<char, LastIndexTable> _lastIndexTable = new Dictionary<char, LastIndexTable>();
    public string PatternToSearch;

    public List<int> GetStartingIndexsOfPatternInText(string textToSearchIn, string patternToSearch)
    {
        var list = new List<int>();
        PatternToSearch = patternToSearch;
        if (patternToSearch != null && !string.IsNullOrEmpty(textToSearchIn))
        {
            UpdateLastIndexTable(patternToSearch);
            PatternToSearch = patternToSearch;

            var j = patternToSearch.Length - 1;

            // Main loop to iterate over whole text
            while (j <= textToSearchIn.Length - 1)
            {
                var lastCharOfPattern = patternToSearch[patternToSearch.Length - 1];

                if (textToSearchIn[j] != lastCharOfPattern)
                {
                    //  Heuristic 1 
                    // If Last Char is not matched with the Last char in pattern and char is not present in the pattern
                    // Then advance pointer 'j' to the length of the pattern in textToSearch.
                    if (!_lastIndexTable.ContainsKey(textToSearchIn[j]))
                    {
                        j += patternToSearch.Length - 1;
                    }

                    // Heuristic 2 
                    // Consult the lastIndex table to get the last index of current char in textToSearch
                    // and advance pointer 'j' to the last index in textToSearch.
                    if (j <= textToSearchIn.Length - 1 && _lastIndexTable.ContainsKey(textToSearchIn[j]))
                    {
                        var tempObj = _lastIndexTable[textToSearchIn[j]];
                        if (tempObj != null) j += tempObj.LastIndex;
                    }
                }

                int k = patternToSearch.Length - 1;
                int u = j;
                if (j <= textToSearchIn.Length - 1)
                {
                    while (k >= 0)
                    {
                        // Heuristic (3a) 
                        // If Last Char is  matched with the Last char in pattern then back track in the text and pattern till 
                        // either you got a complete match or a   mismatched charecter.
                        // Once you got the mismatched char and mismatched char is not present in the pattern then
                        // advance j to the index of mismatched  charecter in the pattern 
                        if (textToSearchIn[u] == patternToSearch[k])
                        {
                            if (k == 0 && textToSearchIn[u] == patternToSearch[k])
                            {
                                list.Add(u);
                                j += patternToSearch.Length - 1;
                            }
                            u--;
                            k--;
                            continue;
                        }

                        if (!_lastIndexTable.ContainsKey(textToSearchIn[u]))
                        {
                            // Heuristic (3b) 
                            // If Last Char is  matched with the Last char in pattern then back track in the text  till 
                            // either you got a complete match or a   mismatched charecter.
                            // Once you got the mismatched char and mismatched char is not present in the pattern then
                            // advance j to the index of mismatched  charecter in the pattern  plus the number to char which matched.
                            j += k + (j - u);
                            break;
                        }

                        k--;
                    }
                }

                j++;
            }

        }
        if (!list.Any())
            list.Add(-1);

        return list;
    }

    private void UpdateLastIndexTable(string patternToSearch)
    {
        _lastIndexTable.Clear();
        var i = patternToSearch.Length - 1;
        foreach (var charToSeach in patternToSearch)
        {
            if (_lastIndexTable.ContainsKey(charToSeach))
            {
                _lastIndexTable[charToSeach].LastIndex = i;
            }
            else
            {
                _lastIndexTable.Add(charToSeach, new LastIndexTable
                {
                    CharSearched = charToSeach,
                    LastIndex = i
                });
            }
            i--;
        }
    }

    private class LastIndexTable
    {
        public char CharSearched { get; set; }

        public int LastIndex { get; set; }
    }
}

and here is the sample usage of it.

var each = "this is sample text to search for";
var result = new BoyerMooreStringSearching().GetStartingIndexsOfPatternInText(each, "to search for and text");
Это было полезно?

Решение

When backtracking (heuristic 3a) you keep finding chars in the searchstring until the end. You need a extra check for this:

if (k == 0 && textToSearchIn[u] == patternToSearch[k])
{
    if (u + patternToSearch.Length <= textToSearchIn.Length)
        list.Add(u);
    j += patternToSearch.Length - 1;
}
Лицензировано под: CC-BY-SA с атрибуция
Не связан с StackOverflow
scroll top