问题本身很简单,通过跟踪每个表的id频率就可以解决。下面是我的解决方案,但可能需要对大型表进行一些优化。
编辑1:
错误修复:类别被覆盖;
eval(track.col)
而不是
eval(parse(text = track.col)))
自定义名称
join
列。
library(data.table)
track.merge <- function(x, y, on, track.col){
x[, N := .N, by = on][]
y[, N := .N, by = on][]
table_join <- merge(x, y, by=on, all.x=T, all.y=T)
x[, N := NULL, by = on][]
y[, N := NULL, by = on][]
table_join[N.x > 1 & N.y > 1,
eval(track.col) := "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"][]
table_join[N.x > 1 & is.na(eval(parse(text = track.col))),
eval(track.col) := "INNER JOIN. MULTIPLE LEFT JOIN"][]
table_join[N.y > 1 & is.na(eval(parse(text = track.col))),
eval(track.col) := "INNER JOIN. MULTIPLE RIGHT JOIN"][]
table_join[is.na(N.x),
eval(track.col) := "RIGHT JOIN"][]
table_join[is.na(N.y),
eval(track.col) := "LEFT JOIN"][]
table_join[is.na(eval(parse(text = track.col))),
eval(track.col) := "INNER JOIN"][]
table_join[, ':=' (N.x = NULL, N.y = NULL)][]
}
编辑2
同一函数的可读性更强的版本
track.merge2 <- function(x, y, on, track.col){
x[, N := .N, by = on][]
y[, N := .N, by = on][]
table_join <- merge(x, y, by=on, all.x=T, all.y=T)
track_ids <- character(NROW(table_join))
x[, N := NULL, by = on][]
y[, N := NULL, by = on][]
track_ids[table_join$N.x > 1 & table_join$N.y > 1] <- "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"
track_ids[table_join$N.x > 1 & track_ids == ""] <- "INNER JOIN. MULTIPLE LEFT JOIN"
track_ids[table_join$N.y > 1 & track_ids == ""] <- "INNER JOIN. MULTIPLE RIGHT JOIN"
track_ids[is.na(table_join$N.x)] <- "RIGHT JOIN"
track_ids[is.na(table_join$N.y)] <- "LEFT JOIN"
track_ids[track_ids == ""] <- "INNER JOIN"
table_join[[track.col]] <- track_ids
table_join[, ':=' (N.x = NULL, N.y = NULL)][]
}
测试:
table1 <- data.table(id=c("A", "C", "C", "B", "F", "H", "H"), price=c(12,11,10,13,10,15,3))
table2 <- data.table(id=c("A", "C", "C", "F", "F", "H", "L"), wharehouse=c("Colorado","Texas","New York", "Washington", "Illinois", "Florida", "Kansas"))
> table1
id price
1: A 12
2: C 11
3: C 10
4: B 13
5: F 10
6: H 15
7: H 3
> table2
id wharehouse
1: A Colorado
2: C Texas
3: C New York
4: F Washington
5: F Illinois
6: H Florida
7: L Kansas
> track.merge(table1, table2, "id", "join")
id price wharehouse join
1: A 12 Colorado INNER JOIN
2: B 13 <NA> LEFT JOIN
3: C 11 Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
4: C 11 New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
5: C 10 Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
6: C 10 New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
7: F 10 Washington INNER JOIN. MULTIPLE RIGHT JOIN
8: F 10 Illinois INNER JOIN. MULTIPLE RIGHT JOIN
9: H 15 Florida INNER JOIN. MULTIPLE LEFT JOIN
10: H 3 Florida INNER JOIN. MULTIPLE LEFT JOIN
11: L NA Kansas RIGHT JOIN
> all.equal(track.merge2(x, y, on = "id", "join"), track.merge(x, y, on = "id", "join"))
[1] TRUE
考虑到具有1000k行和两列的表,合并要慢2.5倍:
library(microbenchmark)
table1 <- data.table(id = sample(1e+6, 1e+6, replace = T), price = rnorm(1e+6))
table2 <- data.table(id = sample(1e+6, 1e+6, replace = T), state = sample(LETTERS, 1e+6, replace = T))
microbenchmark(merge = merge(table1,table2, by="id", all.x=T, all.y=T),
track.merge = track.merge(table1, table2, "id", "join"),
track.merge2 = track.merge2(table1, table2, "id", "join"),
times = 10L)
expr min lq mean median uq max neval cld
merge 569.7727 573.433 577.8784 577.2759 581.9219 586.9951 10 a
track.merge 1456.4417 1536.982 1545.6427 1556.5226 1563.6150 1623.3078 10 c
track.merge2 1392.6832 1464.968 1460.2484 1471.0332 1477.2330 1487.1828 10 b