代码之家  ›  专栏  ›  技术社区  ›  Lovethenakedgun

(MDLD)修改的Damerau-Levenshtein距离算法的Javascript版本

  •  0
  • Lovethenakedgun  · 技术社区  · 7 年前

    我想测试MDLD的性能,以便将一些浏览器内字符串比较集成到web应用程序中。该用例涉及到比较字符串,如“300mm,Packed Wall”和“Packed Wall-300mm”,因此我正在寻找模糊字符串匹配,它对标点和拼写有一定的容忍度,并允许块字符转换。

    我无法在线找到Javascript的实现。我发现为PL/SQL编写的版本可以在 CSIRO's Taxamatch Wiki

    这是我尝试将代码转换为JS;基本函数的结果似乎相当准确,但块转置计算并没有给出预期的结果。E、 g.“Hi-There”vs“There-Hi”返回“6”,无论块限制设置为什么。

    如果有人知道一个有效的实现,你能给我指一下吗?或者,我的改编或源代码本身有什么问题?我所做的唯一主要更改是在两个示例中使用“Math.ceil()”,其中源代码似乎使用整数除法,这总是占主导地位--这会导致输入出现奇怪的问题,导致出现1个字符串--但似乎不会影响我测试过的其他案例的行为。

    function mdld(str1, str2, block_lim)
    {
        mycol = [];
        cost = 0;
        len1 = str1.length;
        len2 = str2.length;
    
        if( str1 === str2 )
            return 0;
        else if ( len1 === 0 || len2 === 0 )
            return Math.max(len1, len2);
        else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
            return 1;
        else
        {
            // Temporary strings which will be pre-processed
            // Speeds up calcs & retains correct measurement
            temp1 = str1;
            temp2 = str2;
    
            // Trim any common initial characters
            while ( temp1.substr(0,1) === temp2.substr(0,1) )
            {
                temp1 = temp1.substr(1, temp1.length);
                temp2 = temp2.substr(1, temp2.length);
            }
    
            // Trim any trailing characters
            while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
            {
                temp1 = temp1.substr(0,temp1.length-1);
                temp2 = temp2.substr(0,temp2.length-1);
            }
    
            len1 = temp1.length;
            len2 = temp2.length;
    
            // Calc Levenshtein Distance
            if (len1 === 0 || len2 === 0)
                return Math.max(len1, len2);
            else if (len1 === 1 && len2 === 1 && str1 !== str2)
                return 1;
            else
            {
                // Create columns
                var s, t;
                for(s = 0; s <= len1; s++)
                    mycol[s] = [];
    
                // Enter values into leftmost column
                for(t = 0; t <= len2; t++)
                    mycol[0][t] = t;
    
                // Populate remaining columns
                for(s = 1; s <= len1; s++)
                {
                    mycol[s][0] = s;
                    // Populate first row (each cell of one column)
                    for(t = 1; t <= len2; t++)
                    {
                        //Calculate cost
                        if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
                            cost = 0;
                        else
                            cost = 1;
    
                        // extension to cover multiple character transpositions
                        // that includes calculation of original Levenshtein distance when no transposition 
                        tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );
    
                        while (tempBlockLen >= 1)
                        {
                            if ((s >= tempBlockLen * 2) && 
                                (t >= tempBlockLen * 2) &&
                                (temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
                                (temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
                            {
                                // Transposition found
                                mycol[s][t] = Math.min( mycol[s][t-1] + 1,
                                                        mycol[s-1][t] + 1,
                                                        mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
                                tempBlockLen = 0;
                            }
                            else if (tempBlockLen === 1)
                            {
                                // No Transposition
                                mycol[s][t] = Math.min( mycol[s][t-1] + 1,
                                                        mycol[s-1][t] + 1,
                                                        mycol[s-1][t-1] + cost );
                            }
                            tempBlockLen = tempBlockLen - 1;    
                        }
                    }
                }
            }
            return mycol[len1][len2];
        }
    }
    
    1 回复  |  直到 7 年前
        1
  •  0
  •   Lovethenakedgun    7 年前

    最后,我无法弄清楚我对CSIRO代码的改编有什么问题。找到了一个用Ruby扩展在C中实现该函数的github repo, https://github.com/GlobalNamesArchitecture/damerau-levenshtein

    对其进行了调整,以获得功能实现。看起来很好,但对我的用例来说不是很好。MDLD可以交换文本块,但仅在不需要多次连续交换来构造源字符串的情况下。我们来看看N-gram。

    对于那些感兴趣的人来说,这是我的最终结果。就性能而言,在块限制为5的情况下,它在大约5秒内比较了1000、20-40个字符串。

    function method_internal_distance(s, t, block_size, max_distance)
    {
        s = s.toLocaleLowerCase();
        t = t.toLocaleLowerCase();
        var pure_levenshtein = false;
        var stop_execution = false;
        var sl = s.length;
        var tl = t.length;
    
        var i, i1, j, j1, k, half_sl, half_tl, cost;
        var distance, del, ins, subs, transp, block;
        var current_distance = 0;
        var min = 0;
    
        if (block_size == 0)
        {
            pure_levenshtein = true;
            block_size = 1;
        }
    
        if (sl == 0) 
            return tl;
        if (tl == 0)
            return sl;
    
        // case of lengths 1 must present or it will break further in the code
        if( sl == 1 && tl ==1 && s != t)
            return 1;
    
        // Increment length value of both strings, to include 0 for matrix
        sl++;
        tl++;
    
        //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
        d = [];
    
        //populate 'vertical' row starting from the 2nd position (first one is filled already)
        for(i=0; i < tl; i++)
        {
            d[i*sl] = i;
        }
    
        //fill up array with scores
        for(i = 1; i<sl; i++)
        {
            d[i] = i;
            if (stop_execution)
                break;
    
            current_distance = 10000;
            for(j=1; j<tl; j++)
            {
                cost = 1;
                if(s[i-1] == t[j-1])
                    cost = 0;
    
                half_sl = Math.floor((sl-1)/2);
                half_tl = Math.floor((tl-1)/2);
    
                block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
                block = block < half_tl || half_tl == 0 ? block : half_tl;
    
                while (block >=1)
                {
                    var swap1 = 1;
                    var swap2 = 1;
    
                    i1 = i - (block * 2);
                    j1 = j - (block * 2);
    
                    for (k = i1; k < (i1 + block); k++)
                    {
                        if(s[k] != t[k+block])
                        {
                            swap = 0;
                            break;
                        }
                    }
    
                    for (k = j1; k < (j1 + block); k++)
                    {
                        if (t[k] != s[k+block])
                        {
                            swap2 = 0;
                            break;
                        }
                    }
    
                    del = d[j*sl + i - 1] + 1;
                    ins = d[(j-1)*sl + i] + 1;
                    min = del;
    
                    if (ins < min)
                        min = ins;
    
                    if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
                    {
                        transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
                        if (transp < min)
                            min = transp;
                        block = 0;
                    }
                    else if (block == 1)
                    {
                        subs = d[(j-1)*sl + i - 1] + cost;
                        if (subs < min)
                            min = subs;
                    }
                    block--;
                }
                d[j*sl+i] = min;
                if (current_distance > d[j*sl+i])
                    current_distance = d[j*sl+i];
            }
            if (current_distance > max_distance)
                stop_execution = true;
        }
        distance = d[sl*tl-1];
        if (stop_execution)
            distance = current_distance;
    
        return distance;
    }