contains
而是使用LevsTein in [编辑]距离考虑下面的方法
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};
var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;
// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;
// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}
// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;
for (j=0; j<str2.length; ++j) {
curCol = nextCol;
// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
return nextCol;
}
};
var the_string1;
try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}
try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}
return Levenshtein.get(the_string1, the_string2)
""";
WITH `project.dataset.fuzzylog` AS (
SELECT 1 id, 'John Smith' fullname UNION ALL
SELECT 2, 'Jane Doe' UNION ALL
SELECT 3, 'Ms. Jane Doe' UNION ALL
SELECT 4, 'Mr. John Smith' UNION ALL
SELECT 5, 'Smith, John' UNION ALL
SELECT 6, 'J.Smith' UNION ALL
SELECT 7, 'J. Doe'
), `project.dataset.names` AS (
SELECT 1 nameid, 'John Smith' fullname, 'smith' match UNION ALL
SELECT 2, 'Jane Doe', 'doe'
), notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
-- ORDER BY 1
结果如下:
Row id logged_fullname nameid fullname
1 1 John Smith 1 John Smith
2 2 Jane Doe 2 Jane Doe
3 3 Ms. Jane Doe 2 Jane Doe
4 4 Mr. John Smith 1 John Smith
5 5 Smith, John 1 John Smith
6 6 J.Smith 1 John Smith
7 7 J. Doe 2 Jane Doe
)而不是应用Levenshtein距离来直接度量两个全名之间的相似性。如您所见,在执行此操作之前,我们将对全名进行重新排序/规范化,以对其部分进行排序
如果这种方法对你有用的话,你应该考虑通过重新删除/替换所有标点符号(如点、逗号等)来改进重新排序,以获得更好的结果。