代码之家  ›  专栏  ›  技术社区  ›  MarkeD

在BigQuery标准SQL中合并行,忽略空值并高效地筛选到最完整的行

  •  0
  • MarkeD  · 技术社区  · 6 年前

    这是从以下时间开始的 this question 这让我几乎达到了目标,但我只想返回数据量最大的行。

    从上面链接的问题扩展解决方案示例:

    #standardSQL
    WITH `project.dataset.your_table` AS (
      SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
      SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
      SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
      SELECT 1,     NULL,             NULL,         '4/26'  
      SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 2,     'this_one',        NULL,        NULL,          UNION ALL            
    )
    SELECT id, 
      IFNULL(col_1, FIRST_VALUE(col_1 IGNORE NULLS) OVER(win)) col_1, 
      IFNULL(col_2, FIRST_VALUE(col_2 IGNORE NULLS) OVER(win)) col_2, 
      updated
    FROM `project.dataset.your_table`
    WINDOW win AS (PARTITION BY id ORDER BY updated DESC 
                   ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
    ORDER BY id, updated
    

    我希望结果如下:

    id  col_1       col_2   updated     
    1   next_data   correct 4/26 
    2   this_one    old     4/23
    

    我用 ROW_NUMBER() 它可以工作,但有人建议它不是很有记忆效率:

    SELECT * EXCEPT(n) FROM (
        WITH `project.dataset.your_table` AS (
          SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
          SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
          SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
          SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
          SELECT 1,     NULL,             NULL,         '4/26'         UNION ALL
          SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
          SELECT 2,     'this_one',        NULL,        NULL                     
        )
        SELECT id, 
          IFNULL(col_1, FIRST_VALUE(col_1 IGNORE NULLS) OVER(win)) col_1, 
          IFNULL(col_2, FIRST_VALUE(col_2 IGNORE NULLS) OVER(win)) col_2, 
          updated,
          ROW_NUMBER() OVER (PARTITION BY id) as n
        FROM `project.dataset.your_table`
        WINDOW win AS (PARTITION BY id ORDER BY updated DESC 
                       ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
        ORDER BY id, updated)
     WHERE n = 1
    

    它给出:

    已更新ID列1列2
    1下一个数据正确4/26
    2这个旧的4/23
    
    1 回复  |  直到 6 年前
        1
  •  3
  •   Mikhail Berlyant    6 年前
    #standardSQL
    WITH `project.dataset.your_table` AS (
      SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
      SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
      SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
      SELECT 1,     NULL,             NULL,         '4/26'         UNION ALL
      SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 2,     'this_one',        NULL,        NULL                 
    )
    SELECT id, 
      ARRAY_AGG(col_1 IGNORE NULLS ORDER BY updated DESC LIMIT 1)[SAFE_OFFSET(0)] col_1,
      ARRAY_AGG(col_2 IGNORE NULLS ORDER BY updated DESC LIMIT 1)[SAFE_OFFSET(0)] col_2,
      MAX(updated) updated
    FROM `project.dataset.your_table`
    GROUP BY id   
    

    以结果

    Row id  col_1       col_2       updated  
    1   1   next_data   correct     4/26     
    2   2   this_one    old         4/23