代码之家  ›  专栏  ›  技术社区  ›  Neil C. Obremski

如何在BigQuery中使用标准SQL左连接的ON子句中的字符串函数?

  •  0
  • Neil C. Obremski  · 技术社区  · 6 年前

    我在使用字符串函数时遇到了麻烦,比如 STARTS_WITH 或操作员,如 LIKE 在一个 LEFT JOIN ON

    让我们考虑一张桌子 fullname 我想通过加入 具有相同列的表。fuzzylog中的键字段可能有点乱,或者是自由形式的,因此不可能直接进行相等联接。这些表可能如下所示:

    fuzzylog 表格:

    fuzzylog table

    names 表格:

    names table

    names表通过提供一个 contains 就像 如果完全匹配失败:

    #standardSQL
    SELECT l.id, n.fullname, n.nameid,
      l.fullname AS logged_fullname
    FROM `neilotemp.fuzzylog` l
    LEFT JOIN `neilotemp.names` n
      ON l.fullname = n.fullname
      OR l.fullname LIKE CONCAT('%', n.contains, '%')
    

    如果没有连接两侧字段相等的条件,则不能使用左外连接。

    2 回复  |  直到 6 年前
        1
  •  3
  •   Mikhail Berlyant    6 年前

    contains
    而是使用LevsTein in [编辑]距离考虑下面的方法

    #standardSQL
    CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
    RETURNS INT64
    LANGUAGE js AS """
      var _extend = function(dst) {
        var sources = Array.prototype.slice.call(arguments, 1);
        for (var i=0; i<sources.length; ++i) {
          var src = sources[i];
          for (var p in src) {
            if (src.hasOwnProperty(p)) dst[p] = src[p];
          }
        }
        return dst;
      };
    
      var Levenshtein = {
        /**
         * Calculate levenshtein distance of the two strings.
         *
         * @param str1 String the first string.
         * @param str2 String the second string.
         * @return Integer the levenshtein distance (0 and above).
         */
        get: function(str1, str2) {
          // base cases
          if (str1 === str2) return 0;
          if (str1.length === 0) return str2.length;
          if (str2.length === 0) return str1.length;
    
          // two rows
          var prevRow  = new Array(str2.length + 1),
              curCol, nextCol, i, j, tmp;
    
          // initialise previous row
          for (i=0; i<prevRow.length; ++i) {
            prevRow[i] = i;
          }
    
          // calculate current row distance from previous row
          for (i=0; i<str1.length; ++i) {
            nextCol = i + 1;
    
            for (j=0; j<str2.length; ++j) {
              curCol = nextCol;
    
              // substution
              nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
              // insertion
              tmp = curCol + 1;
              if (nextCol > tmp) {
                nextCol = tmp;
              }
              // deletion
              tmp = prevRow[j + 1] + 1;
              if (nextCol > tmp) {
                nextCol = tmp;
              }
    
              // copy current col value into previous (in preparation for next iteration)
              prevRow[j] = curCol;
            }
    
            // copy last col value into previous (in preparation for next iteration)
            prevRow[j] = nextCol;
          }
    
          return nextCol;
        }
    
      };
    
      var the_string1;
    
      try {
        the_string1 = decodeURI(string1).toLowerCase();
      } catch (ex) {
        the_string1 = string1.toLowerCase();
      }
    
      try {
        the_string2 = decodeURI(string2).toLowerCase();
      } catch (ex) {
        the_string2 = string2.toLowerCase();
      }
    
      return Levenshtein.get(the_string1, the_string2) 
    
    """;   
    WITH notrmalized_fuzzylog as (
      select id, fullname, 
        (select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
      from `project.dataset.fuzzylog`
    ), normalized_names as (
      select nameid, fullname, 
        (select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
      from `project.dataset.names`
    )
    select
      id, l.fullname AS logged_fullname,
      ARRAY_AGG(
        STRUCT(n.nameid, n.fullname)
        ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
      )[OFFSET(0)].*
    FROM notrmalized_fuzzylog l
    CROSS JOIN normalized_names n
    GROUP BY 1, 2
    

    #standardSQL
    CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
    RETURNS INT64
    LANGUAGE js AS """
      var _extend = function(dst) {
        var sources = Array.prototype.slice.call(arguments, 1);
        for (var i=0; i<sources.length; ++i) {
          var src = sources[i];
          for (var p in src) {
            if (src.hasOwnProperty(p)) dst[p] = src[p];
          }
        }
        return dst;
      };
    
      var Levenshtein = {
        /**
         * Calculate levenshtein distance of the two strings.
         *
         * @param str1 String the first string.
         * @param str2 String the second string.
         * @return Integer the levenshtein distance (0 and above).
         */
        get: function(str1, str2) {
          // base cases
          if (str1 === str2) return 0;
          if (str1.length === 0) return str2.length;
          if (str2.length === 0) return str1.length;
    
          // two rows
          var prevRow  = new Array(str2.length + 1),
              curCol, nextCol, i, j, tmp;
    
          // initialise previous row
          for (i=0; i<prevRow.length; ++i) {
            prevRow[i] = i;
          }
    
          // calculate current row distance from previous row
          for (i=0; i<str1.length; ++i) {
            nextCol = i + 1;
    
            for (j=0; j<str2.length; ++j) {
              curCol = nextCol;
    
              // substution
              nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
              // insertion
              tmp = curCol + 1;
              if (nextCol > tmp) {
                nextCol = tmp;
              }
              // deletion
              tmp = prevRow[j + 1] + 1;
              if (nextCol > tmp) {
                nextCol = tmp;
              }
    
              // copy current col value into previous (in preparation for next iteration)
              prevRow[j] = curCol;
            }
    
            // copy last col value into previous (in preparation for next iteration)
            prevRow[j] = nextCol;
          }
    
          return nextCol;
        }
    
      };
    
      var the_string1;
    
      try {
        the_string1 = decodeURI(string1).toLowerCase();
      } catch (ex) {
        the_string1 = string1.toLowerCase();
      }
    
      try {
        the_string2 = decodeURI(string2).toLowerCase();
      } catch (ex) {
        the_string2 = string2.toLowerCase();
      }
    
      return Levenshtein.get(the_string1, the_string2) 
    
    """;   
    WITH `project.dataset.fuzzylog` AS (
      SELECT 1 id, 'John Smith' fullname UNION ALL
      SELECT 2, 'Jane Doe' UNION ALL
      SELECT 3, 'Ms. Jane Doe' UNION ALL
      SELECT 4, 'Mr. John Smith' UNION ALL
      SELECT 5, 'Smith, John' UNION ALL
      SELECT 6, 'J.Smith' UNION ALL
      SELECT 7, 'J. Doe'
    ), `project.dataset.names` AS (
      SELECT 1 nameid, 'John Smith' fullname, 'smith' match UNION ALL
      SELECT 2, 'Jane Doe', 'doe'
    ), notrmalized_fuzzylog as (
      select id, fullname, 
        (select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
      from `project.dataset.fuzzylog`
    ), normalized_names as (
      select nameid, fullname, 
        (select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
      from `project.dataset.names`
    )
    select
      id, l.fullname AS logged_fullname,
      ARRAY_AGG(
        STRUCT(n.nameid, n.fullname)
        ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
      )[OFFSET(0)].*
    FROM notrmalized_fuzzylog l
    CROSS JOIN normalized_names n
    GROUP BY 1, 2
    -- ORDER BY 1
    

    结果如下:

    Row id  logged_fullname nameid  fullname     
    1   1   John Smith      1       John Smith   
    2   2   Jane Doe        2       Jane Doe     
    3   3   Ms. Jane Doe    2       Jane Doe     
    4   4   Mr. John Smith  1       John Smith   
    5   5   Smith, John     1       John Smith   
    6   6   J.Smith         1       John Smith   
    7   7   J. Doe          2       Jane Doe     
    

    )而不是应用Levenshtein距离来直接度量两个全名之间的相似性。如您所见,在执行此操作之前,我们将对全名进行重新排序/规范化,以对其部分进行排序
    如果这种方法对你有用的话,你应该考虑通过重新删除/替换所有标点符号(如点、逗号等)来改进重新排序,以获得更好的结果。

        2
  •  2
  •   Gordon Linoff    6 年前

    您可以将其用于数组和相关子查询:

    with fuzzylog as (
          select 1 as id, 'John Smith' as fullname union all
          select 2 as id, 'Jane Doe' UNION ALL
          select 6 as id, 'J. Smith'
         ),
         names as (
          select 1 as nameid, 'John Smith' as fullname, 'smith' as word
         )
    select l.id, l.fullname, n.fullname as name_fullname, n.nameid
    from (SELECT l.*, 
                 (SELECT array_agg(n.nameid)
                  from names n
                  where l.fullname = n.fullname OR lower(l.fullname) LIKE CONCAT('%', lower(n.word), '%')  
                 ) nameids
           FROM fuzzylog l
          ) l LEFT JOIN
          unnest(l.nameids) the_nameid left join
          names n
          on n.nameid = the_nameid;