一种更好的变长字符串相似度排序算法
-
19-08-2019 - |
题
我正在寻找一种字符串相似性算法,它可以在可变长度字符串上产生比通常建议的算法(编辑距离、soundex 等)更好的结果。
例如,
给定字符串 A:“罗伯特”,
然后字符串B:“艾米·罗伯逊”
会比
字符串 C:“理查德”
此外,优选地,该算法应该与语言无关(也适用于英语以外的语言)。
解决方案
Catalysoft的西蒙·怀特写了一篇关于一个非常聪明的算法,该作品真的很好,我的目的相邻字符对进行比较:
http://www.catalysoft.com/articles/StrikeAMatch.html
西蒙算法的一个Java版本,下面我写的是一个PL /红宝石版本(从由Mark皇VanHaren相关论坛进入评论做了简单的红宝石版本采取了),这样我可以用它我PostgreSQL的查询:
CREATE FUNCTION string_similarity(str1 varchar, str2 varchar)
RETURNS float8 AS '
str1.downcase!
pairs1 = (0..str1.length-2).collect {|i| str1[i,2]}.reject {
|pair| pair.include? " "}
str2.downcase!
pairs2 = (0..str2.length-2).collect {|i| str2[i,2]}.reject {
|pair| pair.include? " "}
union = pairs1.size + pairs2.size
intersection = 0
pairs1.each do |p1|
0.upto(pairs2.size-1) do |i|
if p1 == pairs2[i]
intersection += 1
pairs2.slice!(i)
break
end
end
end
(2.0 * intersection) / union
' LANGUAGE 'plruby';
就像一个魅力!
其他提示
马扎冈的回答是巨大的。我它转换为C#,所以我想我在这里发布:
/// <summary>
/// This class implements string comparison algorithm
/// based on character pair similarity
/// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
/// </summary>
public class SimilarityTool
{
/// <summary>
/// Compares the two strings based on letter pair matches
/// </summary>
/// <param name="str1"></param>
/// <param name="str2"></param>
/// <returns>The percentage match from 0.0 to 1.0 where 1.0 is 100%</returns>
public double CompareStrings(string str1, string str2)
{
List<string> pairs1 = WordLetterPairs(str1.ToUpper());
List<string> pairs2 = WordLetterPairs(str2.ToUpper());
int intersection = 0;
int union = pairs1.Count + pairs2.Count;
for (int i = 0; i < pairs1.Count; i++)
{
for (int j = 0; j < pairs2.Count; j++)
{
if (pairs1[i] == pairs2[j])
{
intersection++;
pairs2.RemoveAt(j);//Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success
break;
}
}
}
return (2.0 * intersection) / union;
}
/// <summary>
/// Gets all letter pairs for each
/// individual word in the string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private List<string> WordLetterPairs(string str)
{
List<string> AllPairs = new List<string>();
// Tokenize the string and put the tokens/words into an array
string[] Words = Regex.Split(str, @"\s");
// For each word
for (int w = 0; w < Words.Length; w++)
{
if (!string.IsNullOrEmpty(Words[w]))
{
// Find the pairs of characters
String[] PairsInWord = LetterPairs(Words[w]);
for (int p = 0; p < PairsInWord.Length; p++)
{
AllPairs.Add(PairsInWord[p]);
}
}
}
return AllPairs;
}
/// <summary>
/// Generates an array containing every
/// two consecutive letters in the input string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private string[] LetterPairs(string str)
{
int numPairs = str.Length - 1;
string[] pairs = new string[numPairs];
for (int i = 0; i < numPairs; i++)
{
pairs[i] = str.Substring(i, 2);
}
return pairs;
}
}
下面是马扎冈的回答的另一个版本,这一个用Python写的:
def get_bigrams(string):
"""
Take a string and return a list of bigrams.
"""
s = string.lower()
return [s[i:i+2] for i in list(range(len(s) - 1))]
def string_similarity(str1, str2):
"""
Perform bigram comparison between two strings
and return a percentage match in decimal form.
"""
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
union = len(pairs1) + len(pairs2)
hit_count = 0
for x in pairs1:
for y in pairs2:
if x == y:
hit_count += 1
break
return (2.0 * hit_count) / union
if __name__ == "__main__":
"""
Run a test using the example taken from:
http://www.catalysoft.com/articles/StrikeAMatch.html
"""
w1 = 'Healed'
words = ['Heard', 'Healthy', 'Help', 'Herded', 'Sealed', 'Sold']
for w2 in words:
print('Healed --- ' + w2)
print(string_similarity(w1, w2))
print()
这是我建议的 StrikeAMatch 算法的 PHP 实现,由 Simon White 编写。优点(如链接中所述)是:
词汇相似度的真实反映 - 差异较小的字符串应被识别为相似。特别是,显着的子串重叠应该表明字符串之间具有高度的相似性。
对词序变化的鲁棒性 - 包含相同单词但顺序不同的两个字符串应被识别为相似。另一方面,如果一个字符串只是另一个字符串中包含的字符的随机字谜,那么它(通常)应该被识别为不相似。
语言独立性 - 该算法不仅适用于英语,而且适用于许多不同的语言。
<?php
/**
* LetterPairSimilarity algorithm implementation in PHP
* @author Igal Alkon
* @link http://www.catalysoft.com/articles/StrikeAMatch.html
*/
class LetterPairSimilarity
{
/**
* @param $str
* @return mixed
*/
private function wordLetterPairs($str)
{
$allPairs = array();
// Tokenize the string and put the tokens/words into an array
$words = explode(' ', $str);
// For each word
for ($w = 0; $w < count($words); $w++)
{
// Find the pairs of characters
$pairsInWord = $this->letterPairs($words[$w]);
for ($p = 0; $p < count($pairsInWord); $p++)
{
$allPairs[] = $pairsInWord[$p];
}
}
return $allPairs;
}
/**
* @param $str
* @return array
*/
private function letterPairs($str)
{
$numPairs = mb_strlen($str)-1;
$pairs = array();
for ($i = 0; $i < $numPairs; $i++)
{
$pairs[$i] = mb_substr($str,$i,2);
}
return $pairs;
}
/**
* @param $str1
* @param $str2
* @return float
*/
public function compareStrings($str1, $str2)
{
$pairs1 = $this->wordLetterPairs(strtoupper($str1));
$pairs2 = $this->wordLetterPairs(strtoupper($str2));
$intersection = 0;
$union = count($pairs1) + count($pairs2);
for ($i=0; $i < count($pairs1); $i++)
{
$pair1 = $pairs1[$i];
$pairs2 = array_values($pairs2);
for($j = 0; $j < count($pairs2); $j++)
{
$pair2 = $pairs2[$j];
if ($pair1 === $pair2)
{
$intersection++;
unset($pairs2[$j]);
break;
}
}
}
return (2.0*$intersection)/$union;
}
}
一个较短的版本约翰拉特利奇的回答的:
def get_bigrams(string):
'''
Takes a string and returns a list of bigrams
'''
s = string.lower()
return {s[i:i+2] for i in xrange(len(s) - 1)}
def string_similarity(str1, str2):
'''
Perform bigram comparison between two strings
and return a percentage match in decimal form
'''
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
return (2.0 * len(pairs1 & pairs2)) / (len(pairs1) + len(pairs2))
这个讨论已经真正有用的,谢谢。我转换算法来VBA为使用Excel和写工作表函数,一个的几个版本的一对串的简单比较,其他为一个字符串比较字符串的范围/阵列。所述strSimLookup版本返回任一最后最佳匹配作为一个字符串,数组索引,或相似性度量。
本实施产生在西蒙·怀特的网站上低分匹配的少数例外亚马逊示例中列出了相同的结果;不知道在哪里的差异逐渐在,可能是VBA的分裂功能,但我还没有研究作为它的正常工作,我的目的。
'Implements functions to rate how similar two strings are on
'a scale of 0.0 (completely dissimilar) to 1.0 (exactly similar)
'Source: http://www.catalysoft.com/articles/StrikeAMatch.html
'Author: Bob Chatham, bob.chatham at gmail.com
'9/12/2010
Option Explicit
Public Function stringSimilarity(str1 As String, str2 As String) As Variant
'Simple version of the algorithm that computes the similiarity metric
'between two strings.
'NOTE: This verision is not efficient to use if you're comparing one string
'with a range of other values as it will needlessly calculate the pairs for the
'first string over an over again; use the array-optimized version for this case.
Dim sPairs1 As Collection
Dim sPairs2 As Collection
Set sPairs1 = New Collection
Set sPairs2 = New Collection
WordLetterPairs str1, sPairs1
WordLetterPairs str2, sPairs2
stringSimilarity = SimilarityMetric(sPairs1, sPairs2)
Set sPairs1 = Nothing
Set sPairs2 = Nothing
End Function
Public Function strSimA(str1 As Variant, rRng As Range) As Variant
'Return an array of string similarity indexes for str1 vs every string in input range rRng
Dim sPairs1 As Collection
Dim sPairs2 As Collection
Dim arrOut As Variant
Dim l As Long, j As Long
Set sPairs1 = New Collection
WordLetterPairs CStr(str1), sPairs1
l = rRng.Count
ReDim arrOut(1 To l)
For j = 1 To l
Set sPairs2 = New Collection
WordLetterPairs CStr(rRng(j)), sPairs2
arrOut(j) = SimilarityMetric(sPairs1, sPairs2)
Set sPairs2 = Nothing
Next j
strSimA = Application.Transpose(arrOut)
End Function
Public Function strSimLookup(str1 As Variant, rRng As Range, Optional returnType) As Variant
'Return either the best match or the index of the best match
'depending on returnTYype parameter) between str1 and strings in rRng)
' returnType = 0 or omitted: returns the best matching string
' returnType = 1 : returns the index of the best matching string
' returnType = 2 : returns the similarity metric
Dim sPairs1 As Collection
Dim sPairs2 As Collection
Dim metric, bestMetric As Double
Dim i, iBest As Long
Const RETURN_STRING As Integer = 0
Const RETURN_INDEX As Integer = 1
Const RETURN_METRIC As Integer = 2
If IsMissing(returnType) Then returnType = RETURN_STRING
Set sPairs1 = New Collection
WordLetterPairs CStr(str1), sPairs1
bestMetric = -1
iBest = -1
For i = 1 To rRng.Count
Set sPairs2 = New Collection
WordLetterPairs CStr(rRng(i)), sPairs2
metric = SimilarityMetric(sPairs1, sPairs2)
If metric > bestMetric Then
bestMetric = metric
iBest = i
End If
Set sPairs2 = Nothing
Next i
If iBest = -1 Then
strSimLookup = CVErr(xlErrValue)
Exit Function
End If
Select Case returnType
Case RETURN_STRING
strSimLookup = CStr(rRng(iBest))
Case RETURN_INDEX
strSimLookup = iBest
Case Else
strSimLookup = bestMetric
End Select
End Function
Public Function strSim(str1 As String, str2 As String) As Variant
Dim ilen, iLen1, ilen2 As Integer
iLen1 = Len(str1)
ilen2 = Len(str2)
If iLen1 >= ilen2 Then ilen = ilen2 Else ilen = iLen1
strSim = stringSimilarity(Left(str1, ilen), Left(str2, ilen))
End Function
Sub WordLetterPairs(str As String, pairColl As Collection)
'Tokenize str into words, then add all letter pairs to pairColl
Dim Words() As String
Dim word, nPairs, pair As Integer
Words = Split(str)
If UBound(Words) < 0 Then
Set pairColl = Nothing
Exit Sub
End If
For word = 0 To UBound(Words)
nPairs = Len(Words(word)) - 1
If nPairs > 0 Then
For pair = 1 To nPairs
pairColl.Add Mid(Words(word), pair, 2)
Next pair
End If
Next word
End Sub
Private Function SimilarityMetric(sPairs1 As Collection, sPairs2 As Collection) As Variant
'Helper function to calculate similarity metric given two collections of letter pairs.
'This function is designed to allow the pair collections to be set up separately as needed.
'NOTE: sPairs2 collection will be altered as pairs are removed; copy the collection
'if this is not the desired behavior.
'Also assumes that collections will be deallocated somewhere else
Dim Intersect As Double
Dim Union As Double
Dim i, j As Long
If sPairs1.Count = 0 Or sPairs2.Count = 0 Then
SimilarityMetric = CVErr(xlErrNA)
Exit Function
End If
Union = sPairs1.Count + sPairs2.Count
Intersect = 0
For i = 1 To sPairs1.Count
For j = 1 To sPairs2.Count
If StrComp(sPairs1(i), sPairs2(j)) = 0 Then
Intersect = Intersect + 1
sPairs2.Remove j
Exit For
End If
Next j
Next i
SimilarityMetric = (2 * Intersect) / Union
End Function
我很抱歉,答案不是由作者发明的。这是一个众所周知的算法,该算法是本第一由数字设备公司,并且通常被称为压挤。
http://www.hpl.hp.com/techreports /Compaq-DEC/SRC-TN-1997-015.pdf
我翻译西蒙·怀特的算法向PL / pgSQL。这是我的贡献。
<!-- language: lang-sql -->
create or replace function spt1.letterpairs(in p_str varchar)
returns varchar as
$$
declare
v_numpairs integer := length(p_str)-1;
v_pairs varchar[];
begin
for i in 1 .. v_numpairs loop
v_pairs[i] := substr(p_str, i, 2);
end loop;
return v_pairs;
end;
$$ language 'plpgsql';
--===================================================================
create or replace function spt1.wordletterpairs(in p_str varchar)
returns varchar as
$$
declare
v_allpairs varchar[];
v_words varchar[];
v_pairsinword varchar[];
begin
v_words := regexp_split_to_array(p_str, '[[:space:]]');
for i in 1 .. array_length(v_words, 1) loop
v_pairsinword := spt1.letterpairs(v_words[i]);
if v_pairsinword is not null then
for j in 1 .. array_length(v_pairsinword, 1) loop
v_allpairs := v_allpairs || v_pairsinword[j];
end loop;
end if;
end loop;
return v_allpairs;
end;
$$ language 'plpgsql';
--===================================================================
create or replace function spt1.arrayintersect(ANYARRAY, ANYARRAY)
returns anyarray as
$$
select array(select unnest($1) intersect select unnest($2))
$$ language 'sql';
--===================================================================
create or replace function spt1.comparestrings(in p_str1 varchar, in p_str2 varchar)
returns float as
$$
declare
v_pairs1 varchar[];
v_pairs2 varchar[];
v_intersection integer;
v_union integer;
begin
v_pairs1 := wordletterpairs(upper(p_str1));
v_pairs2 := wordletterpairs(upper(p_str2));
v_union := array_length(v_pairs1, 1) + array_length(v_pairs2, 1);
v_intersection := array_length(arrayintersect(v_pairs1, v_pairs2), 1);
return (2.0 * v_intersection / v_union);
end;
$$ language 'plpgsql';
该算法的一个更快的PHP版本:
/**
*
* @param $str
* @return mixed
*/
private static function wordLetterPairs ($str)
{
$allPairs = array();
// Tokenize the string and put the tokens/words into an array
$words = explode(' ', $str);
// For each word
for ($w = 0; $w < count($words); $w ++) {
// Find the pairs of characters
$pairsInWord = self::letterPairs($words[$w]);
for ($p = 0; $p < count($pairsInWord); $p ++) {
$allPairs[$pairsInWord[$p]] = $pairsInWord[$p];
}
}
return array_values($allPairs);
}
/**
*
* @param $str
* @return array
*/
private static function letterPairs ($str)
{
$numPairs = mb_strlen($str) - 1;
$pairs = array();
for ($i = 0; $i < $numPairs; $i ++) {
$pairs[$i] = mb_substr($str, $i, 2);
}
return $pairs;
}
/**
*
* @param $str1
* @param $str2
* @return float
*/
public static function compareStrings ($str1, $str2)
{
$pairs1 = self::wordLetterPairs(mb_strtolower($str1));
$pairs2 = self::wordLetterPairs(mb_strtolower($str2));
$union = count($pairs1) + count($pairs2);
$intersection = count(array_intersect($pairs1, $pairs2));
return (2.0 * $intersection) / $union;
}
有关我有数据(约2300比较)我的0.58sec与有约色ALKON 溶液与一个运行时间0.35sec与矿。
在美丽的Scala一个版本:
def pairDistance(s1: String, s2: String): Double = {
def strToPairs(s: String, acc: List[String]): List[String] = {
if (s.size < 2) acc
else strToPairs(s.drop(1),
if (s.take(2).contains(" ")) acc else acc ::: List(s.take(2)))
}
val lst1 = strToPairs(s1.toUpperCase, List())
val lst2 = strToPairs(s2.toUpperCase, List())
(2.0 * lst2.intersect(lst1).size) / (lst1.size + lst2.size)
}
下面是R版本:
get_bigrams <- function(str)
{
lstr = tolower(str)
bigramlst = list()
for(i in 1:(nchar(str)-1))
{
bigramlst[[i]] = substr(str, i, i+1)
}
return(bigramlst)
}
str_similarity <- function(str1, str2)
{
pairs1 = get_bigrams(str1)
pairs2 = get_bigrams(str2)
unionlen = length(pairs1) + length(pairs2)
hit_count = 0
for(x in 1:length(pairs1)){
for(y in 1:length(pairs2)){
if (pairs1[[x]] == pairs2[[y]])
hit_count = hit_count + 1
}
}
return ((2.0 * hit_count) / unionlen)
}
double dice_match(const char *string1, const char *string2) {
//check fast cases
if (((string1 != NULL) && (string1[0] == '\0')) ||
((string2 != NULL) && (string2[0] == '\0'))) {
return 0;
}
if (string1 == string2) {
return 1;
}
size_t strlen1 = strlen(string1);
size_t strlen2 = strlen(string2);
if (strlen1 < 2 || strlen2 < 2) {
return 0;
}
size_t length1 = strlen1 - 1;
size_t length2 = strlen2 - 1;
double matches = 0;
int i = 0, j = 0;
//get bigrams and compare
while (i < length1 && j < length2) {
char a[3] = {string1[i], string1[i + 1], '\0'};
char b[3] = {string2[j], string2[j + 1], '\0'};
int cmp = strcmpi(a, b);
if (cmp == 0) {
matches += 2;
}
i++;
j++;
}
return matches / (length1 + length2);
}
一些测试基于 来源文章:
#include <stdio.h>
void article_test1() {
char *string1 = "FRANCE";
char *string2 = "FRENCH";
printf("====%s====\n", __func__);
printf("%2.f%% == 40%%\n", dice_match(string1, string2) * 100);
}
void article_test2() {
printf("====%s====\n", __func__);
char *string = "Healed";
char *ss[] = {"Heard", "Healthy", "Help",
"Herded", "Sealed", "Sold"};
int correct[] = {44, 55, 25, 40, 80, 0};
for (int i = 0; i < 6; ++i) {
printf("%2.f%% == %d%%\n", dice_match(string, ss[i]) * 100, correct[i]);
}
}
void multicase_test() {
char *string1 = "FRaNcE";
char *string2 = "fREnCh";
printf("====%s====\n", __func__);
printf("%2.f%% == 40%%\n", dice_match(string1, string2) * 100);
}
void gg_test() {
char *string1 = "GG";
char *string2 = "GGGGG";
printf("====%s====\n", __func__);
printf("%2.f%% != 100%%\n", dice_match(string1, string2) * 100);
}
int main() {
article_test1();
article_test2();
multicase_test();
gg_test();
return 0;
}
这是迈克尔·拉Vo1E时的真棒C#版本大厦,按照要求,使其扩展方法,这里是我想出了。做这种方式的主要好处是,你可以按百分比匹配一个泛型列表。例如,考虑你有你的对象为“城市”的字符串场。用户搜索“志士”,并要在比赛递减顺序返回结果。例如,你想切斯特的字面匹配罗切斯特之前露面。要做到这一点,添加两个新的属性到对象:
public string SearchText { get; set; }
public double PercentMatch
{
get
{
return City.ToUpper().PercentMatchTo(this.SearchText.ToUpper());
}
}
然后每个对象上,设置SEARCHTEXT给用户搜索内容。然后,你可以用类似容易对其进行排序:
zipcodes = zipcodes.OrderByDescending(x => x.PercentMatch);
这里的轻微修改以使其扩展方法:
/// <summary>
/// This class implements string comparison algorithm
/// based on character pair similarity
/// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
/// </summary>
public static double PercentMatchTo(this string str1, string str2)
{
List<string> pairs1 = WordLetterPairs(str1.ToUpper());
List<string> pairs2 = WordLetterPairs(str2.ToUpper());
int intersection = 0;
int union = pairs1.Count + pairs2.Count;
for (int i = 0; i < pairs1.Count; i++)
{
for (int j = 0; j < pairs2.Count; j++)
{
if (pairs1[i] == pairs2[j])
{
intersection++;
pairs2.RemoveAt(j);//Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success
break;
}
}
}
return (2.0 * intersection) / union;
}
/// <summary>
/// Gets all letter pairs for each
/// individual word in the string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private static List<string> WordLetterPairs(string str)
{
List<string> AllPairs = new List<string>();
// Tokenize the string and put the tokens/words into an array
string[] Words = Regex.Split(str, @"\s");
// For each word
for (int w = 0; w < Words.Length; w++)
{
if (!string.IsNullOrEmpty(Words[w]))
{
// Find the pairs of characters
String[] PairsInWord = LetterPairs(Words[w]);
for (int p = 0; p < PairsInWord.Length; p++)
{
AllPairs.Add(PairsInWord[p]);
}
}
}
return AllPairs;
}
/// <summary>
/// Generates an array containing every
/// two consecutive letters in the input string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private static string[] LetterPairs(string str)
{
int numPairs = str.Length - 1;
string[] pairs = new string[numPairs];
for (int i = 0; i < numPairs; i++)
{
pairs[i] = str.Substring(i, 2);
}
return pairs;
}
我的JavaScript实现需要的字符串的字符串或阵列,和一个可选的地板(缺省楼层是0.5)。如果你传递一个字符串,它会返回true或false取决于字符串的相似度得分是否大于或等于地板。如果你传递一个字符串数组,它会返回这些字符串其相似度得分大于或等于到地板上,由程度排序的阵列。
示例:
'Healed'.fuzzy('Sealed'); // returns true
'Healed'.fuzzy('Help'); // returns false
'Healed'.fuzzy('Help', 0.25); // returns true
'Healed'.fuzzy(['Sold', 'Herded', 'Heard', 'Help', 'Sealed', 'Healthy']);
// returns ["Sealed", "Healthy"]
'Healed'.fuzzy(['Sold', 'Herded', 'Heard', 'Help', 'Sealed', 'Healthy'], 0);
// returns ["Sealed", "Healthy", "Heard", "Herded", "Help", "Sold"]
下面,它是:
(function(){
var default_floor = 0.5;
function pairs(str){
var pairs = []
, length = str.length - 1
, pair;
str = str.toLowerCase();
for(var i = 0; i < length; i++){
pair = str.substr(i, 2);
if(!/\s/.test(pair)){
pairs.push(pair);
}
}
return pairs;
}
function similarity(pairs1, pairs2){
var union = pairs1.length + pairs2.length
, hits = 0;
for(var i = 0; i < pairs1.length; i++){
for(var j = 0; j < pairs1.length; j++){
if(pairs1[i] == pairs2[j]){
pairs2.splice(j--, 1);
hits++;
break;
}
}
}
return 2*hits/union || 0;
}
String.prototype.fuzzy = function(strings, floor){
var str1 = this
, pairs1 = pairs(this);
floor = typeof floor == 'number' ? floor : default_floor;
if(typeof(strings) == 'string'){
return str1.length > 1 && strings.length > 1 && similarity(pairs1, pairs(strings)) >= floor || str1.toLowerCase() == strings.toLowerCase();
}else if(strings instanceof Array){
var scores = {};
strings.map(function(str2){
scores[str2] = str1.length > 1 ? similarity(pairs1, pairs(str2)) : 1*(str1.toLowerCase() == str2.toLowerCase());
});
return strings.filter(function(str){
return scores[str] >= floor;
}).sort(function(a, b){
return scores[b] - scores[a];
});
}
};
})();
这是为了您的方便缩小的版本:
(function(){function g(a){var b=[],e=a.length-1,d;a=a.toLowerCase();for(var c=0;c<e;c++)d=a.substr(c,2),/\s/.test(d)||b.push(d);return b}function h(a,b){for(var e=a.length+b.length,d=0,c=0;c<a.length;c++)for(var f=0;f<a.length;f++)if(a[c]==b[f]){b.splice(f--,1);d++;break}return 2*d/e||0}String.prototype.fuzzy=function(a,b){var e=this,d=g(this);b="number"==typeof b?b:0.5;if("string"==typeof a)return 1<e.length&&1<a.length&&h(d,g(a))>=b||e.toLowerCase()==a.toLowerCase();if(a instanceof Array){var c={};a.map(function(a){c[a]=1<e.length?h(d,g(a)):1*(e.toLowerCase()==a.toLowerCase())});return a.filter(function(a){return c[a]>=b}).sort(function(a,b){return c[b]-c[a]})}}})();
在骰子系数算法(西蒙白/马扎冈的回答)的红宝石被实现在 在amatch宝石pair_distance_similar方法
https://github.com/flori/amatch
此宝石也包含了一些近似匹配和字符串比较算法的实现:的Levenshtein编辑距离,卖方编辑距离,汉明距离,最长公共子序列的长度,最长的共同子串的长度,所述一对距离度量,所述哈罗-Winkler度量。
一个Haskell的版本随意提出修改建议,因为我还没有做太多的Haskell。
import Data.Char
import Data.List
-- Convert a string into words, then get the pairs of words from that phrase
wordLetterPairs :: String -> [String]
wordLetterPairs s1 = concat $ map pairs $ words s1
-- Converts a String into a list of letter pairs.
pairs :: String -> [String]
pairs [] = []
pairs (x:[]) = []
pairs (x:ys) = [x, head ys]:(pairs ys)
-- Calculates the match rating for two strings
matchRating :: String -> String -> Double
matchRating s1 s2 = (numberOfMatches * 2) / totalLength
where pairsS1 = wordLetterPairs $ map toLower s1
pairsS2 = wordLetterPairs $ map toLower s2
numberOfMatches = fromIntegral $ length $ pairsS1 `intersect` pairsS2
totalLength = fromIntegral $ length pairsS1 + length pairsS2
Clojure的:
(require '[clojure.set :refer [intersection]])
(defn bigrams [s]
(->> (split s #"\s+")
(mapcat #(partition 2 1 %))
(set)))
(defn string-similarity [a b]
(let [a-pairs (bigrams a)
b-pairs (bigrams b)
total-count (+ (count a-pairs) (count b-pairs))
match-count (count (intersection a-pairs b-pairs))
similarity (/ (* 2 match-count) total-count)]
similarity))
什么Levenshtein距离,由所述第一字符串的长度除以(或备选地分割我的最小/最大/平均值两个字符串的长度)?这为我工作至今。
嗨我这就给在JavaScript一试,但我是新来的吧,谁知道更快的方法可以做到这一点?
function get_bigrams(string) {
// Takes a string and returns a list of bigrams
var s = string.toLowerCase();
var v = new Array(s.length-1);
for (i = 0; i< v.length; i++){
v[i] =s.slice(i,i+2);
}
return v;
}
function string_similarity(str1, str2){
/*
Perform bigram comparison between two strings
and return a percentage match in decimal form
*/
var pairs1 = get_bigrams(str1);
var pairs2 = get_bigrams(str2);
var union = pairs1.length + pairs2.length;
var hit_count = 0;
for (x in pairs1){
for (y in pairs2){
if (pairs1[x] == pairs2[y]){
hit_count++;
}
}
}
return ((2.0 * hit_count) / union);
}
var w1 = 'Healed';
var word =['Heard','Healthy','Help','Herded','Sealed','Sold']
for (w2 in word){
console.log('Healed --- ' + word[w2])
console.log(string_similarity(w1,word[w2]));
}
下面是基于在索伦森-骰子指数(马扎冈的答案),这一个写在相似性的另一版本C ++ 11:
/*
* Similarity based in Sørensen–Dice index.
*
* Returns the Similarity between _str1 and _str2.
*/
double similarity_sorensen_dice(const std::string& _str1, const std::string& _str2) {
// Base case: if some string is empty.
if (_str1.empty() || _str2.empty()) {
return 1.0;
}
auto str1 = upper_string(_str1);
auto str2 = upper_string(_str2);
// Base case: if the strings are equals.
if (str1 == str2) {
return 0.0;
}
// Base case: if some string does not have bigrams.
if (str1.size() < 2 || str2.size() < 2) {
return 1.0;
}
// Extract bigrams from str1
auto num_pairs1 = str1.size() - 1;
std::unordered_set<std::string> str1_bigrams;
str1_bigrams.reserve(num_pairs1);
for (unsigned i = 0; i < num_pairs1; ++i) {
str1_bigrams.insert(str1.substr(i, 2));
}
// Extract bigrams from str2
auto num_pairs2 = str2.size() - 1;
std::unordered_set<std::string> str2_bigrams;
str2_bigrams.reserve(num_pairs2);
for (unsigned int i = 0; i < num_pairs2; ++i) {
str2_bigrams.insert(str2.substr(i, 2));
}
// Find the intersection between the two sets.
int intersection = 0;
if (str1_bigrams.size() < str2_bigrams.size()) {
const auto it_e = str2_bigrams.end();
for (const auto& bigram : str1_bigrams) {
intersection += str2_bigrams.find(bigram) != it_e;
}
} else {
const auto it_e = str1_bigrams.end();
for (const auto& bigram : str2_bigrams) {
intersection += str1_bigrams.find(bigram) != it_e;
}
}
// Returns similarity coefficient.
return (2.0 * intersection) / (num_pairs1 + num_pairs2);
}
我一直在寻找纯Ruby实现由@马扎冈的回答表明了算法的。不幸的是,@marzagao表示链接断开。在@ s01ipsist的答案,他表示红宝石宝石 amatch 地方执行不纯粹的红宝石。所以,我的searchd一点,发现宝石 fuzzy_match 其中有纯Ruby实现(尽管这种宝石使用amatch
)在这里。我希望这会帮助我这样的人。