Question

I need to calculate the similarity between two strings. What exactly do I mean? Let me explain with an example:

  • The real word: hospital
  • Mistaken word: haspita

Now my aim is to determine how many characters I need to modify in the mistaken word to obtain the real word. In this example, I need to modify 2 letters. So what would that be as a percentage? I always take the length of the real word, so it becomes 2 / 8 = 25%, which means the similarity of these two given strings is 75%.

How can I achieve this with performance being a key consideration?


Solution

What you are looking for is called edit distance or Levenshtein distance. The Wikipedia article explains how it is calculated, and it has a nice piece of pseudocode at the bottom to help you implement this algorithm in C# very easily.

Here's a C# implementation:

private static int CalcLevenshteinDistance(string a, string b)
{
    if (String.IsNullOrEmpty(a) && String.IsNullOrEmpty(b)) {
        return 0;
    }
    if (String.IsNullOrEmpty(a)) {
        return b.Length;
    }
    if (String.IsNullOrEmpty(b)) {
        return a.Length;
    }

    int lengthA = a.Length;
    int lengthB = b.Length;
    var distances = new int[lengthA + 1, lengthB + 1];

    // The distance from any prefix to an empty string is the prefix's length.
    for (int i = 0; i <= lengthA; i++) distances[i, 0] = i;
    for (int j = 0; j <= lengthB; j++) distances[0, j] = j;

    for (int i = 1; i <= lengthA; i++)
    {
        for (int j = 1; j <= lengthB; j++)
        {
            int cost = b[j - 1] == a[i - 1] ? 0 : 1;
            // Cheapest of: deletion, insertion, substitution (or match).
            distances[i, j] = Math.Min(
                Math.Min(distances[i - 1, j] + 1, distances[i, j - 1] + 1),
                distances[i - 1, j - 1] + cost);
        }
    }

    return distances[lengthA, lengthB];
}
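
To turn the distance into the percentage described in the question, divide by the length of the real word and subtract from 1. A minimal sketch using the method above:

string real = "hospital";
string mistaken = "haspita";
int distance = CalcLevenshteinDistance(real, mistaken);   // 2

// Similarity as defined in the question: 1 - distance / length of the real word.
double similarity = 1.0 - (double)distance / real.Length; // 0.75
Console.WriteLine(similarity * 100 + "%");                // prints "75%"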

OTHER TIPS

I addressed this exact issue a few weeks ago. Since someone is asking now, I'll share the code. In my exhaustive tests my code is about 10x faster than the C# example on Wikipedia even when no maximum distance is supplied. When a maximum distance is supplied, this performance gain increases to 30x-100x+. Note a couple of key points for performance:

  • If you need to compare the same words over and over, first convert the words to arrays of integers (see the sketch after the code below). The Damerau-Levenshtein algorithm includes many >, <, == comparisons, and ints compare much faster than chars.
  • It includes a short-circuiting mechanism to quit if the distance exceeds a provided maximum
  • Use a rotating set of three arrays rather than a massive matrix, as in all the implementations I've seen elsewhere.
  • Make sure your arrays slice across the shorter word's width.

Code (it works exactly the same if you replace int[] with string in the parameter declarations):

/// <summary>
/// Computes the Damerau-Levenshtein Distance between two strings, represented as arrays of
/// integers, where each integer represents the code point of a character in the source string.
/// Includes a threshold which can be used to indicate the maximum allowable distance.
/// </summary>
/// <param name="source">An array of the code points of the first string</param>
/// <param name="target">An array of the code points of the second string</param>
/// <param name="threshold">Maximum allowable distance</param>
/// <returns>int.MaxValue if the threshold is exceeded; otherwise, the Damerau-Levenshtein distance between the strings</returns>
public static int DamerauLevenshteinDistance(int[] source, int[] target, int threshold) {

    int length1 = source.Length;
    int length2 = target.Length;

    // Return trivial case - difference in string lengths exceeds threshold
    if (Math.Abs(length1 - length2) > threshold) { return int.MaxValue; }

    // Ensure arrays [i] / length1 use shorter length 
    if (length1 > length2) {
        Swap(ref target, ref source);
        Swap(ref length1, ref length2);
    }

    int maxi = length1;
    int maxj = length2;

    int[] dCurrent = new int[maxi + 1];
    int[] dMinus1 = new int[maxi + 1];
    int[] dMinus2 = new int[maxi + 1];
    int[] dSwap;

    for (int i = 0; i <= maxi; i++) { dCurrent[i] = i; }

    int jm1 = 0, im1 = 0, im2 = -1;

    for (int j = 1; j <= maxj; j++) {

        // Rotate
        dSwap = dMinus2;
        dMinus2 = dMinus1;
        dMinus1 = dCurrent;
        dCurrent = dSwap;

        // Initialize
        int minDistance = int.MaxValue;
        dCurrent[0] = j;
        im1 = 0;
        im2 = -1;

        for (int i = 1; i <= maxi; i++) {

            int cost = source[im1] == target[jm1] ? 0 : 1;

            int del = dCurrent[im1] + 1;
            int ins = dMinus1[i] + 1;
            int sub = dMinus1[im1] + cost;

            //Fastest execution for min value of 3 integers
            int min = (del > ins) ? (ins > sub ? sub : ins) : (del > sub ? sub : del);

            if (i > 1 && j > 1 && source[im2] == target[jm1] && source[im1] == target[j - 2])
                min = Math.Min(min, dMinus2[im2] + cost);

            dCurrent[i] = min;
            if (min < minDistance) { minDistance = min; }
            im1++;
            im2++;
        }
        jm1++;
        if (minDistance > threshold) { return int.MaxValue; }
    }

    int result = dCurrent[maxi];
    return (result > threshold) ? int.MaxValue : result;
}

Where Swap is:

static void Swap<T>(ref T arg1,ref T arg2) {
    T temp = arg1;
    arg1 = arg2;
    arg2 = temp;
}
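
For example, a minimal sketch of the conversion suggested in the first bullet above (ToCodePoints is an illustrative helper, not part of the code above):

// Illustrative helper: convert a string to an array of its character codes
// once, then reuse the arrays across many comparisons.
static int[] ToCodePoints(string s)
{
    var codes = new int[s.Length];
    for (int i = 0; i < s.Length; i++) codes[i] = s[i];
    return codes;
}

int distance = DamerauLevenshteinDistance(
    ToCodePoints("hospital"), ToCodePoints("haspita"), threshold: 3);
// distance == 2; int.MaxValue would mean "more than 3 edits apart"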

There is a large number of string similarity and distance algorithms that can be used; among them (not an exhaustive list) are Levenshtein, Jaro-Winkler, and Smith-Waterman-Gotoh, all discussed below.

A library that contains implementations of all of these is SimMetrics, which has both Java and C# versions.

I have found that Levenshtein and Jaro-Winkler are great for small differences between strings, such as:

  • Spelling mistakes; or
  • ö instead of o in a person's name.

However, when comparing something like article titles, where significant chunks of the text are the same but with "noise" around the edges, Smith-Waterman-Gotoh has been fantastic:

Compare these two titles (the same article, worded differently by different sources):

An endonuclease from Escherichia coli that introduces single polynucleotide chain scissions in ultraviolet-irradiated DNA

Endonuclease III: An Endonuclease from Escherichia coli That Introduces Single Polynucleotide Chain Scissions in Ultraviolet-Irradiated DNA

A site that compares the algorithms on these strings shows:

  • Levenshtein: 81
  • Smith-Waterman-Gotoh: 94
  • Jaro-Winkler: 78

Jaro-Winkler and Levenshtein are not as good as Smith-Waterman-Gotoh at detecting the similarity here. If we compare two titles that are not the same article but have some matching text:

Fat metabolism in higher plants. The function of acyl thioesterases in the metabolism of acyl-coenzymes A and acyl-acyl carrier proteins

Fat metabolism in higher plants. The determination of acyl-acyl carrier protein and acyl coenzyme A in a complex lipid mixture

Jaro-Winkler gives a false positive, but Smith-Waterman-Gotoh does not:

  • Levenshtein: 54
  • Smith-Waterman-Gotoh: 49
  • Jaro-Winkler: 89

As Anastasiosyal pointed out, SimMetrics has the Java code for these algorithms. I had success using the SmithWatermanGotoh Java code from SimMetrics.

Here is my implementation of Damerau-Levenshtein distance. Rather than returning only a similarity coefficient, it also reports the error locations in the corrected word (this feature can be used in text editors). My implementation also supports different weights for the errors (substitution, deletion, insertion, transposition).

public static List<Mistake> OptimalStringAlignmentDistance(
  string word, string correctedWord,
  bool transposition = true,
  int substitutionCost = 1,
  int insertionCost = 1,
  int deletionCost = 1,
  int transpositionCost = 1)
{
    int w_length = word.Length;
    int cw_length = correctedWord.Length;
    var d = new KeyValuePair<int, CharMistakeType>[w_length + 1, cw_length + 1];
    var result = new List<Mistake>(Math.Max(w_length, cw_length));

    if (w_length == 0)
    {
        for (int i = 0; i < cw_length; i++)
            result.Add(new Mistake(i, CharMistakeType.Insertion));
        return result;
    }

    for (int i = 0; i <= w_length; i++)
        d[i, 0] = new KeyValuePair<int, CharMistakeType>(i, CharMistakeType.None);

    for (int j = 0; j <= cw_length; j++)
        d[0, j] = new KeyValuePair<int, CharMistakeType>(j, CharMistakeType.None);

    for (int i = 1; i <= w_length; i++)
    {
        for (int j = 1; j <= cw_length; j++)
        {
            bool equal = correctedWord[j - 1] == word[i - 1];
            int delCost = d[i - 1, j].Key + deletionCost;
            int insCost = d[i, j - 1].Key + insertionCost;
            int subCost = d[i - 1, j - 1].Key;
            if (!equal)
                subCost += substitutionCost;
            int transCost = int.MaxValue;
            if (transposition && i > 1 && j > 1 && word[i - 1] == correctedWord[j - 2] && word[i - 2] == correctedWord[j - 1])
            {
                transCost = d[i - 2, j - 2].Key;
                if (!equal)
                    transCost += transpositionCost;
            }

            int min = delCost;
            CharMistakeType mistakeType = CharMistakeType.Deletion;
            if (insCost < min)
            {
                min = insCost;
                mistakeType = CharMistakeType.Insertion;
            }
            if (subCost < min)
            {
                min = subCost;
                mistakeType = equal ? CharMistakeType.None : CharMistakeType.Substitution;
            }
            if (transCost < min)
            {
                min = transCost;
                mistakeType = CharMistakeType.Transposition;
            }

            d[i, j] = new KeyValuePair<int, CharMistakeType>(min, mistakeType);
        }
    }

    int w_ind = w_length;
    int cw_ind = cw_length;
    while (w_ind >= 0 && cw_ind >= 0)
    {
        switch (d[w_ind, cw_ind].Value)
        {
            case CharMistakeType.None:
                w_ind--;
                cw_ind--;
                break;
            case CharMistakeType.Substitution:
                result.Add(new Mistake(cw_ind - 1, CharMistakeType.Substitution));
                w_ind--;
                cw_ind--;
                break;
            case CharMistakeType.Deletion:
                result.Add(new Mistake(cw_ind, CharMistakeType.Deletion));
                w_ind--;
                break;
            case CharMistakeType.Insertion:
                result.Add(new Mistake(cw_ind - 1, CharMistakeType.Insertion));
                cw_ind--;
                break;
            case CharMistakeType.Transposition:
                result.Add(new Mistake(cw_ind - 2, CharMistakeType.Transposition));
                w_ind -= 2;
                cw_ind -= 2;
                break;
        }
    }
    if (d[w_length, cw_length].Key > result.Count)
    {
        int delMistakesCount = d[w_length, cw_length].Key - result.Count;
        for (int i = 0; i < delMistakesCount; i++)
            result.Add(new Mistake(0, CharMistakeType.Deletion));
    }

    result.Reverse();

    return result;
}

public struct Mistake
{
    public int Position;
    public CharMistakeType Type;

    public Mistake(int position, CharMistakeType type)
    {
        Position = position;
        Type = type;
    }

    public override string ToString()
    {
        return Position + ", " + Type;
    }
}

public enum CharMistakeType
{
    None,
    Substitution,
    Insertion,
    Deletion,
    Transposition
}
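
A minimal usage sketch (the output shown is illustrative):

// Find where "haspita" differs from the corrected word "hospital".
List<Mistake> mistakes = OptimalStringAlignmentDistance("haspita", "hospital");
foreach (Mistake m in mistakes)
    Console.WriteLine(m); // each error's position in the corrected word and its type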

This code is a part of my project: Yandex-Linguistics.NET.

I wrote some tests, and it seems to me that the method is working. Comments and remarks are welcome.

Here is an alternative approach:


A typical method for finding similarity is Levenshtein distance, and there is no doubt a library with code available.

Unfortunately, this requires comparing against every string. Even if you write a specialized version of the code that short-circuits the calculation when the distance exceeds some threshold, you still have to do all the comparisons.

Another idea is to use some variant of trigrams or n-grams. These are sequences of n characters (or n words, or n genomic sequences, or n whatever). Keep a mapping from trigrams to strings and choose the candidates that have the biggest overlap. A typical choice of n is 3, hence the name.

For instance, English would have these trigrams:

  • Eng
  • ngl
  • gli
  • lis
  • ish

And England would have:

  • Eng
  • ngl
  • gla
  • lan
  • and

Well, two of the five trigrams in each word match (4 of the 10 in total). If this works for you, you can index the trigram/string table and get a faster search.

You can also combine this with Levenshtein to reduce the set of comparisons to those strings that have some minimum number of n-grams in common, as sketched below.
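
A minimal sketch of that idea in C# (all names are illustrative, not from any particular library):

using System;
using System.Collections.Generic;
using System.Linq;

static class TrigramIndex
{
    // All trigrams of a word; words shorter than 3 characters yield none.
    static IEnumerable<string> Trigrams(string s) =>
        Enumerable.Range(0, Math.Max(0, s.Length - 2)).Select(i => s.Substring(i, 3));

    // Map each trigram to the dictionary words that contain it.
    public static Dictionary<string, List<string>> Build(IEnumerable<string> words)
    {
        var index = new Dictionary<string, List<string>>();
        foreach (var word in words)
            foreach (var tri in Trigrams(word).Distinct())
            {
                if (!index.TryGetValue(tri, out var bucket))
                    index[tri] = bucket = new List<string>();
                bucket.Add(word);
            }
        return index;
    }

    // Words sharing at least minShared trigrams with the query; run the
    // expensive edit-distance calculation only on these candidates.
    public static IEnumerable<string> Candidates(
        Dictionary<string, List<string>> index, string query, int minShared)
    {
        return Trigrams(query).Distinct()
            .Where(index.ContainsKey)
            .SelectMany(tri => index[tri])
            .GroupBy(word => word)
            .Where(g => g.Count() >= minShared)
            .Select(g => g.Key);
    }
}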

Here's a VB.NET implementation:

Public Shared Function LevenshteinDistance(ByVal v1 As String, ByVal v2 As String) As Integer
    Dim cost(v1.Length, v2.Length) As Integer
    If v1.Length = 0 Then
        Return v2.Length                'if string 1 is empty, the number of edits will be the insertion of all characters in string 2
    ElseIf v2.Length = 0 Then
        Return v1.Length                'if string 2 is empty, the number of edits will be the insertion of all characters in string 1
    Else
        'setup the base costs for inserting the correct characters
        For v1Count As Integer = 0 To v1.Length
            cost(v1Count, 0) = v1Count
        Next v1Count
        For v2Count As Integer = 0 To v2.Length
            cost(0, v2Count) = v2Count
        Next v2Count
        'now work out the cheapest route to having the correct characters
        For v1Count As Integer = 1 To v1.Length
            For v2Count As Integer = 1 To v2.Length
                'the first min term is the cost of editing the character in place (the cost-to-date, or the cost-to-date + 1, depending on whether a change is required),
                'the second min term is the cost of inserting the correct character into string 1 (cost-to-date + 1), and
                'the third min term is the cost of inserting the correct character into string 2 (cost-to-date + 1)
                cost(v1Count, v2Count) = Math.Min(
                    cost(v1Count - 1, v2Count - 1) + If(v1.Chars(v1Count - 1) = v2.Chars(v2Count - 1), 0, 1),
                    Math.Min(
                        cost(v1Count - 1, v2Count) + 1,
                        cost(v1Count, v2Count - 1) + 1
                    )
                )
            Next v2Count
        Next v1Count

        'the final result is the cheapest cost to get the two strings to match, which is the bottom right cell in the matrix
        'in the event of strings being equal, this will be the result of zipping diagonally down the matrix (which will be square as the strings are the same length)
        Return cost(v1.Length, v2.Length)
    End If
End Function
Licensed under: CC-BY-SA with attribution