Since you are comparing against the same word over and over, you can get a small performance improvement by using partial application to cache the per-word setup:
// Partially applied Levenshtein: call once with s1, reuse the returned
// closure for many s2 values. (Body elided here; the full version follows.)
function levenshtein(s1) {
var row0 = [], row1 = [], s1_len = s1.length;
// Distance from the empty string is simply the other string's length.
if (s1_len === 0)
return function(s2) {
return s2.length;
};
return function(s2) {
var s2_len = s2.length, s1_idx, s2_idx = 0, a, b, c, c2, i = 0;
if (s2_len === 0)
return s1_len;
…
return b;
};
}
I also repeated myself a bit by linearizing the loop somewhat.
Not sure whether it gets a lot faster, but you can omit one of the arrays - you do not need to read/write them in an alternating manner:
// Builds a Levenshtein-distance function specialised for the word s1.
// The returned closure computes the edit distance between s1 and its
// argument, reusing a single work row (previous DP matrix row) across calls.
function levenshtein(s1) {
var len1 = s1.length, work = new Array(len1);
// Distance from the empty string is just the other string's length.
if (len1 === 0)
return function(s2) {
return s2.length;
};
return function(s2) {
var len2 = s2.length;
if (len2 === 0)
return len1;
var col, rowIdx, diag, best, sub, ch;
// Base row: distance from each prefix of s1 to the empty string.
for (col = 0; col < len1; ++col)
work[col] = col + 1;
best = 0;
for (rowIdx = 0; rowIdx < len2; ++rowIdx) {
ch = s2[rowIdx];
diag = rowIdx;     // matrix cell up-and-left of the current one
best = rowIdx + 1; // cell directly to the left (empty s1 prefix)
for (col = 0; col < len1; ++col) {
// Substitution cost uses the saved diagonal value.
sub = diag + (s1[col] === ch ? 0 : 1);
diag = work[col]; // save the "up" cell before we overwrite it
// Minimum of deletion (best+1), insertion (diag+1), substitution.
best = best < diag
? (best < sub ? best + 1 : sub)
: (diag < sub ? diag + 1 : sub);
work[col] = best;
}
}
return best;
};
}
I don't think further optimisations can be made without putting your millions of words in a dedicated data structure (e.g. a prefix trie).