代码之家  ›  专栏  ›  技术社区  ›  LocoGris

合并表:根据连接行的方式对输出进行分类

  •  0
  • LocoGris  · 技术社区  · 6 年前

    在R中连接(大型、复杂)表时,我通常很难检查此操作的结果。 这里是一个最小的可复制示例:

    library(data.table) 
    table1 <- data.table(id=c("A", "B", "C"), price=c(12,11,10))
    table2 <- data.table(id=c("A", "C", "C", "D"), wharehouse=c("Colorado","Texas","New York", "Oregon"))
    
    
    table_join <- merge(table1,table2,
                        by="id",
                        all.x=T,
                        all.y=T)
    

    structure(list(id = c("A", "B", "C", "C", "D"), price = c(12, 
                                                              11, 10, 10, NA), wharehouse = c("Colorado", NA, "Texas", "New York", 
                                                                                              "Oregon"), join = c("INNER JOIN", "LEFT JOIN", "INNER JOIN. MULTIPLE RIGHT JOIN", 
                                                                                                                  "INNER JOIN. MULTIPLE RIGHT JOIN", "RIGHT JOIN")), row.names = c(NA, 
                                                                                                                                                                                   -5L), class = c("data.table", "data.frame"))
    

    但是我想知道有多少行与另一个表不匹配,有一个匹配,多个匹配。。。

    我想一些信息(可能是一个新行)描述如何从合并结果。请参阅一个可能的示例:

    table_join[1, join:="INNER JOIN"]
    table_join[2, join:="LEFT JOIN"]
    table_join[3, join:="INNER JOIN. MULTIPLE RIGHT JOIN"]
    table_join[4, join:="INNER JOIN. MULTIPLE RIGHT JOIN"]
    table_join[5, join:="RIGHT JOIN"]
    

    这里是“预期产出”

    structure(list(id = c("A", "B", "C", "C", "D"), price = c(12, 
    11, 10, 10, NA), wharehouse = c("Colorado", NA, "Texas", "New York", 
    "Oregon"), join = c("INNER JOIN", "LEFT JOIN", "INNER JOIN. MULTIPLE RIGHT JOIN", 
    "INNER JOIN. MULTIPLE RIGHT JOIN", "RIGHT JOIN")), row.names = c(NA, 
    -5L), class = c("data.table", "data.frame"))
    

    id 存在于另一个表中,但具有 NAs (在我的例子中,假设我们有id D 但是价格 NA ).

    此外,它还可以帮助我在合并多个表时跟踪复杂的情况

    0 回复  |  直到 6 年前
        1
  •  1
  •   Gerald T    6 年前

    问题本身很简单,通过跟踪每个表的id频率就可以解决。下面是我的解决方案,但可能需要对大型表进行一些优化。

    编辑1:

    错误修复:类别被覆盖; eval(track.col) 而不是 eval(parse(text = track.col)))

    自定义名称 join 列。

    library(data.table)
    
    track.merge <- function(x, y, on, track.col){
    
      x[, N := .N, by = on][]
      y[, N := .N, by = on][]
    
      table_join <- merge(x, y, by=on, all.x=T, all.y=T)
    
      x[, N := NULL, by = on][]
      y[, N := NULL, by = on][]
    
      table_join[N.x > 1 & N.y > 1,                              
                 eval(track.col) := "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"][]
    
      table_join[N.x > 1 & is.na(eval(parse(text = track.col))), 
                 eval(track.col) := "INNER JOIN. MULTIPLE LEFT JOIN"][]
    
      table_join[N.y > 1 & is.na(eval(parse(text = track.col))), 
                 eval(track.col) := "INNER JOIN. MULTIPLE RIGHT JOIN"][]
    
      table_join[is.na(N.x),                                     
                 eval(track.col) := "RIGHT JOIN"][]
    
      table_join[is.na(N.y),                                
                 eval(track.col) := "LEFT JOIN"][]
    
      table_join[is.na(eval(parse(text = track.col))),      
                 eval(track.col) := "INNER JOIN"][]
    
      table_join[, ':=' (N.x = NULL, N.y = NULL)][]
    }
    

    编辑2

    同一函数的可读性更强的版本

    track.merge2 <- function(x, y, on, track.col){
    
      x[, N := .N, by = on][]
      y[, N := .N, by = on][]
    
      table_join <- merge(x, y, by=on, all.x=T, all.y=T)
      track_ids <- character(NROW(table_join))
    
      x[, N := NULL, by = on][]
      y[, N := NULL, by = on][]
    
      track_ids[table_join$N.x > 1 & table_join$N.y > 1] <- "INNER JOIN. MULTIPLE LEFT RIGHT JOIN"
      track_ids[table_join$N.x > 1 & track_ids == ""]    <- "INNER JOIN. MULTIPLE LEFT JOIN"
      track_ids[table_join$N.y > 1 & track_ids == ""]    <- "INNER JOIN. MULTIPLE RIGHT JOIN"
      track_ids[is.na(table_join$N.x)]                   <- "RIGHT JOIN"
      track_ids[is.na(table_join$N.y)]                   <- "LEFT JOIN"
      track_ids[track_ids == ""]                         <- "INNER JOIN"
    
      table_join[[track.col]] <- track_ids
      table_join[, ':=' (N.x = NULL, N.y = NULL)][]
    }
    

    测试:

    table1 <- data.table(id=c("A", "C", "C", "B", "F", "H", "H"), price=c(12,11,10,13,10,15,3)) 
    table2 <- data.table(id=c("A", "C", "C", "F", "F", "H", "L"), wharehouse=c("Colorado","Texas","New York", "Washington", "Illinois", "Florida", "Kansas")) 
    
    > table1
       id price
    1:  A    12
    2:  C    11
    3:  C    10
    4:  B    13
    5:  F    10
    6:  H    15
    7:  H     3
    
    > table2
       id wharehouse
    1:  A   Colorado
    2:  C      Texas
    3:  C   New York
    4:  F Washington
    5:  F   Illinois
    6:  H    Florida
    7:  L     Kansas
    
    > track.merge(table1, table2, "id", "join")
        id price wharehouse                                 join
     1:  A    12   Colorado                           INNER JOIN
     2:  B    13       <NA>                            LEFT JOIN
     3:  C    11      Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
     4:  C    11   New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
     5:  C    10      Texas INNER JOIN. MULTIPLE LEFT RIGHT JOIN
     6:  C    10   New York INNER JOIN. MULTIPLE LEFT RIGHT JOIN
     7:  F    10 Washington      INNER JOIN. MULTIPLE RIGHT JOIN
     8:  F    10   Illinois      INNER JOIN. MULTIPLE RIGHT JOIN
     9:  H    15    Florida       INNER JOIN. MULTIPLE LEFT JOIN
    10:  H     3    Florida       INNER JOIN. MULTIPLE LEFT JOIN
    11:  L    NA     Kansas                           RIGHT JOIN
    
    
    > all.equal(track.merge2(x, y, on = "id", "join"), track.merge(x, y, on = "id", "join"))
    [1] TRUE
    

    考虑到具有1000k行和两列的表,合并要慢2.5倍:

    library(microbenchmark)
    
    table1 <- data.table(id = sample(1e+6, 1e+6, replace = T), price = rnorm(1e+6))
    table2 <- data.table(id = sample(1e+6, 1e+6, replace = T), state = sample(LETTERS, 1e+6, replace = T))
    
    microbenchmark(merge        = merge(table1,table2, by="id", all.x=T, all.y=T),
                   track.merge  = track.merge(table1, table2, "id", "join"), 
                   track.merge2 = track.merge2(table1, table2, "id", "join"), 
                   times = 10L)
    
             expr       min       lq      mean    median        uq       max neval cld
            merge  569.7727  573.433  577.8784  577.2759  581.9219  586.9951    10 a  
      track.merge 1456.4417 1536.982 1545.6427 1556.5226 1563.6150 1623.3078    10   c
     track.merge2 1392.6832 1464.968 1460.2484 1471.0332 1477.2330 1487.1828    10  b 
    

        2
  •  1
  •   Lala La    6 年前

    这是我的解决方案使用 dplyr

    你可以用这个代码得到频率表。

    library(tidyverse)
    table1 %>% left_join(table2) %>% 
          group_by(id) %>%
          summarise(num_wharehouse = sum(!is.na(wharehouse))) 
    
    Joining, by = "id"
    # A tibble: 3 x 2
      id    num_wharehouse
      <chr>          <int>
    1 A                  1
    2 B                  0
    3 C                  2
    

    table1 %>% left_join(table2) %>% 
              group_by(id) %>%
              summarise(num_wharehouse = sum(!is.na(wharehouse))) %>%
              summarise(merged = sum(num_wharehouse > 0),
                        not_merged = sum(num_wharehouse == 0), 
                        single_match = sum(num_wharehouse == 1),
                        multi_match = sum(num_wharehouse > 1))
    
    Joining, by = "id"
    # A tibble: 1 x 4
      merged not_merged single_match multi_match
       <int>      <int>        <int>       <int>
    1      2          1            1           1
    
        3
  •  1
  •   moodymudskipper    6 年前

    我的包裹 safejoin

    # devtools::install_github("moodymudskipper/safejoin")
    library(safejoin)
    safe_full_join(table1, table2, check="uvmn")
      id price wharehouse
    1  A    12   Colorado
    2  B    11       <NA>
    3  C    10      Texas
    4  C    10   New York
    5  D    NA     Oregon
    Warning messages:
    1: x has unmatched sets of joining values: 
        id
    1:  B 
    2: y has unmatched sets of joining values: 
        id
    1:  D 
    3: y is not unique on id 
    

    检查由单个字符串参数(即字符序列)处理,其中大写字母触发失败,小写字母触发警告,字母前缀为 ~ there are more )具体如下:

    • "u" 像unique一样检查连接列是否在x上形成唯一键
    • "v"
    • "m" like match检查x的所有行是否都匹配
    • "n" 检查y的所有行是否匹配
        4
  •  0
  •   Soren    6 年前

    包装器函数可以在连接之后应用,以分析其路径,如下所示。OP和评论中给出的场景:

    #Scenario 1
    table1 <- data.table(id=c("A", "B", "C"), price=c(12,11,10)); table2 <- data.table(id=c("A", "C", "C", "D"), wharehouse=c("Colorado","Texas","New York", "Oregon"));
    
    #Scenario 2
    table1 <- data.table(id=c("C", "C", "C"), price=c(12,11,10)); table2 <- data.table(id=c("A", "C", "C", "D"), wharehouse=c("Colorado","Texas","New York", "Oregon"));
    
    #Scenario 3
    table1 <- data.table(id=c(NA, "C", "C"), price=c(12,11,10)); table2 <- data.table(id=c("A", "C", "C", NA), wharehouse=c("Colorado","Texas","New York", "Oregon")) 
    
    #Scenario 4
    table1 <- data.table(id=c("A", "A", "C"), price=c(12,11,10)); table2 <- data.table(id=c("B", "C", "C","D"), wharehouse=c("Colorado","Texas","New York", "Oregon")) 
    
    setkeyv(table1,"id")
    setkeyv(table2,"id")
    table_join  <- merge(table1,table2,by="id",all.x=T,all.y=T)
    
    write_description <- function(p,w,n) { 
      inners <- (!is.na(p) & !is.na(w))
      lefts <-  (!is.na(p) & is.na(w))
      rights <- ((is.na(p) & !is.na(w))) | (n > 1 & !is.na(w))
      multis <- n > 1
    
      paste0(ifelse(inners,"INNER JOIN ",""),
             ifelse(multis,"MULTIPLE ",""),
             ifelse(lefts,"LEFT JOIN ",""),
             ifelse(rights,"RIGHT JOIN ",""))
    }
    
    
    table_join[,description:=write_description(price,wharehouse,.N),by="id"]
    

    结果场景1:

    > table_join
       id price wharehouse                     description
    1:  A    12   Colorado                     INNER JOIN 
    2:  B    11         NA                      LEFT JOIN 
    3:  C    10      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    4:  C    10   New York INNER JOIN MULTIPLE RIGHT JOIN 
    5:  D    NA     Oregon                     RIGHT JOIN 
    

    > table_join
       id price wharehouse                     description
    1:  A    NA   Colorado                     RIGHT JOIN 
    2:  C    12      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    3:  C    12   New York INNER JOIN MULTIPLE RIGHT JOIN 
    4:  C    11      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    5:  C    11   New York INNER JOIN MULTIPLE RIGHT JOIN 
    6:  C    10      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    7:  C    10   New York INNER JOIN MULTIPLE RIGHT JOIN 
    8:  D    NA     Oregon                     RIGHT JOIN 
    

    结果:情景3

    > table_join
       id price wharehouse                     description
    1: NA    12     Oregon                     INNER JOIN 
    2:  A    NA   Colorado                     RIGHT JOIN 
    3:  C    11      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    4:  C    11   New York INNER JOIN MULTIPLE RIGHT JOIN 
    5:  C    10      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    6:  C    10   New York INNER JOIN MULTIPLE RIGHT JOIN 
    

    结果:情景4

    > table_join
       id price wharehouse                     description
    1:  A    12         NA             MULTIPLE LEFT JOIN 
    2:  A    11         NA             MULTIPLE LEFT JOIN 
    3:  B    NA   Colorado                     RIGHT JOIN 
    4:  C    10      Texas INNER JOIN MULTIPLE RIGHT JOIN 
    5:  C    10   New York INNER JOIN MULTIPLE RIGHT JOIN 
    6:  D    NA     Oregon                     RIGHT JOIN