最后,我无法弄清楚我对CSIRO代码的改编有什么问题。找到了一个用Ruby扩展在C中实现该函数的github repo,
https://github.com/GlobalNamesArchitecture/damerau-levenshtein
。
对其进行了调整,以获得功能实现。看起来很好,但对我的用例来说不是很好。MDLD可以交换文本块,但仅在不需要多次连续交换来构造源字符串的情况下。我们来看看N-gram。
对于那些感兴趣的人来说,这是我的最终结果。就性能而言,在块限制为5的情况下,它在大约5秒内比较了1000、20-40个字符串。
function method_internal_distance(s, t, block_size, max_distance)
{
s = s.toLocaleLowerCase();
t = t.toLocaleLowerCase();
var pure_levenshtein = false;
var stop_execution = false;
var sl = s.length;
var tl = t.length;
var i, i1, j, j1, k, half_sl, half_tl, cost;
var distance, del, ins, subs, transp, block;
var current_distance = 0;
var min = 0;
if (block_size == 0)
{
pure_levenshtein = true;
block_size = 1;
}
if (sl == 0)
return tl;
if (tl == 0)
return sl;
// case of lengths 1 must present or it will break further in the code
if( sl == 1 && tl ==1 && s != t)
return 1;
// Increment length value of both strings, to include 0 for matrix
sl++;
tl++;
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
d = [];
//populate 'vertical' row starting from the 2nd position (first one is filled already)
for(i=0; i < tl; i++)
{
d[i*sl] = i;
}
//fill up array with scores
for(i = 1; i<sl; i++)
{
d[i] = i;
if (stop_execution)
break;
current_distance = 10000;
for(j=1; j<tl; j++)
{
cost = 1;
if(s[i-1] == t[j-1])
cost = 0;
half_sl = Math.floor((sl-1)/2);
half_tl = Math.floor((tl-1)/2);
block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
block = block < half_tl || half_tl == 0 ? block : half_tl;
while (block >=1)
{
var swap1 = 1;
var swap2 = 1;
i1 = i - (block * 2);
j1 = j - (block * 2);
for (k = i1; k < (i1 + block); k++)
{
if(s[k] != t[k+block])
{
swap = 0;
break;
}
}
for (k = j1; k < (j1 + block); k++)
{
if (t[k] != s[k+block])
{
swap2 = 0;
break;
}
}
del = d[j*sl + i - 1] + 1;
ins = d[(j-1)*sl + i] + 1;
min = del;
if (ins < min)
min = ins;
if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
{
transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
if (transp < min)
min = transp;
block = 0;
}
else if (block == 1)
{
subs = d[(j-1)*sl + i - 1] + cost;
if (subs < min)
min = subs;
}
block--;
}
d[j*sl+i] = min;
if (current_distance > d[j*sl+i])
current_distance = d[j*sl+i];
}
if (current_distance > max_distance)
stop_execution = true;
}
distance = d[sl*tl-1];
if (stop_execution)
distance = current_distance;
return distance;
}