以下是BigQuery标准SQL
最有可能的是,您的数据中的语言数量并不是事先知道的,所以我建议下面的方法,首先收集数据中的所有语言,并按字母顺序排列,然后针对每一行生成0和1的向量,表示基于它们的位置i存在的各自语言。n基础语言列表
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'French,English' langs UNION ALL
SELECT 'Dutch,French,English' UNION ALL
SELECT 'English' UNION ALL
SELECT 'French,Dutch'
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(SPLIT(langs)) lang
)
)
SELECT langs, all_langs,
(SELECT STRING_AGG(IF(lang IS NULL, '0', '1') ORDER BY pos)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(SPLIT(langs)) lang
ON base_lang = lang
) AS value
FROM `project.dataset.table` t
CROSS JOIN base b
结果是
Row langs all_langs value
1 French,English Dutch,English,French 0,1,1
2 Dutch,French,English Dutch,English,French 1,1,1
3 English Dutch,English,French 0,1,0
4 French,Dutch Dutch,English,French 1,0,1
希望,这将为您的特定用例提供良好的起点。
注:BigQuery不支持本机透视,因此上述方法最有可能是最适合您的
…我的行已经是字符串数组…我有[_法语_,__英语_trade]而不是[_法语,英语_..那这仍然有效吗?
绝对-是的!你唯一需要做的改变就是更换
UNNEST(SPLIT(langs))
具有
UNNEST(langs)
如下面的例子所示
#standardSQL
WITH `project.dataset.table` AS (
SELECT ['French','English'] langs UNION ALL
SELECT ['Dutch','French','English'] UNION ALL
SELECT ['English'] UNION ALL
SELECT ['French','Dutch']
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(langs) lang
)
)
SELECT langs, all_langs,
(SELECT STRING_AGG(IF(lang IS NULL, '0', '1') ORDER BY pos)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(langs) lang
ON base_lang = lang
) AS value
FROM `project.dataset.table` t
CROSS JOIN base b
有结果的
如果一行是[法语、英语、英语]。期望值为0,1,2
见下面的例子
#standardSQL
WITH `project.dataset.table` AS (
SELECT ['French','English','English'] langs UNION ALL
SELECT ['Dutch','French','English'] UNION ALL
SELECT ['English','English'] UNION ALL
SELECT ['French','Dutch']
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(langs) lang
)
)
SELECT langs, all_langs,
ARRAY_TO_STRING(ARRAY(SELECT CAST(SUM(IF(lang IS NULL, 0, 1)) AS STRING)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(langs) lang
ON base_lang = lang
GROUP BY base_lang
ORDER BY MIN(pos)
), ',') AS value
FROM `project.dataset.table` t
CROSS JOIN base b
有结果的