代码之家  ›  专栏  ›  技术社区  ›  DSA

基于在R中不同排列的相同信息的前两列合并两个数据帧

  •  1
  • DSA  · 技术社区  · 7 年前

    我想基于前两列及其值合并两个数据帧，但是同一对值在不同的数据集中可能出现在相反的列里（即两列的值互换）。因此，merge 函数以及 dplyr 包中的 left_join 函数都无法识别这些成对信息其实是相同的。

    为了更好的解释,我在这里定义了两个假设数据集:

    # Question data set 1: each row holds a pair of tree IDs (stored as factors)
    # plus a value for that pair. Console continuation prompts ("+") are left out
    # so the snippet can be pasted and run as-is.
    tree.dat1 <- data.frame(
      tree1 = factor(c(rep(33, 3), rep(22, 2), 11)),
      tree2 = factor(c(22, 11, 44, 11, 44, 44)),
      value = c(0.02, rep(0.03, 3), rep(0.01, 2))
    )
    
    > tree.dat1
       tree1 tree2 value
    1    33    22  0.02
    2    33    11  0.03
    3    33    44  0.03
    4    22    11  0.03
    5    22    44  0.01
    6    11    44  0.01
    
    # Question data set 2: the same kind of pairs, but some are stored in the
    # opposite column order (e.g. 11-33 here versus 33-11 in tree.dat1).
    # value1 is written out explicitly: `rep(3, 0.05)` is a zero-length vector
    # (times is truncated to 0), so a column built from it only reaches six rows
    # because data.frame() silently recycles the three remaining values.
    tree.dat2 <- data.frame(
      tree1 = factor(c(rep(11, 3), rep(33, 2), 22)),
      tree2 = factor(c(22, 33, 44, 22, 44, 44)),
      value1 = rep(c(0.02, 0.03, 0.03), times = 2)
    )
    > tree.dat2
      tree1 tree2 value1
    1    11    22   0.02
    2    11    33   0.03
    3    11    44   0.03
    4    33    22   0.02
    5    33    44   0.03
    6    22    44   0.03
    

    所以:

    # Join on the literal (tree1, tree2) values. A pair that tree.dat2 stores in
    # the opposite order (e.g. 33-11 here vs 11-33 there) finds no match, which is
    # why value1 is NA for those rows in the output below.
    > tree.dat3 = left_join(tree.dat1,tree.dat2, by = c("tree1","tree2"))
    > tree.dat3
       tree1 tree2 value value1
    1    33    22  0.02   0.02
    2    33    11  0.03     NA
    3    33    44  0.03   0.03
    4    22    11  0.03     NA
    5    22    44  0.01   0.03
    6    11    44  0.01   0.03
    

    可以看到，合并后的 tree.dat3 中出现了 NA。我想要的是下面这样的结果，并且保持 tree.dat1 数据集的行顺序：

       tree1 tree2 value value1
    1    33    22  0.02   0.02
    2    33    11  0.03   0.03
    3    33    44  0.03   0.03
    4    22    11  0.03   0.02
    5    22    44  0.01   0.03
    6    11    44  0.01   0.03
    

    因此,我可能正在寻找其他方法来合并两个数据帧,以检查成对信息,而不是两列中的因子级别。因为33-11和11-33是相同的,但是第三列中的值不同。我想知道一个适合大数据集的方法。有什么建议吗?

    2 回复  |  直到 7 年前
        1
  •  4
  •   Mako212    7 年前

    一种方法是根据 tree1 和 tree2 构造一个顺序一致的索引列 TreeID，然后按这一列进行连接。

    注意：我去掉了示例数据中的 factor，这样 min/max 才能直接对数值进行比较；如果您的数据是 factor，需要先用 as.numeric(as.character(tree.dat1$tree1)) 这样的方式转换。也可以对 character 而不是 numeric 取最小/最大值，但我不喜欢这样做，因为 max("11","2") 会按字符串顺序比较并返回 "2"。

    library(tidyverse)
    library(stringr)
    
    # Sample data 1 with plain numeric IDs (no factors), so numeric min/max
    # comparisons can be applied to the two tree columns.
    tree.dat1 <- data.frame(
      tree1 = rep(c(33, 22, 11), times = c(3, 2, 1)),
      tree2 = c(22, 11, 44, 11, 44, 44),
      value = rep(c(0.02, 0.03, 0.01), times = c(1, 3, 2))
    )
    
    # Sample data 2 with plain numeric IDs. Some pairs appear in the opposite
    # column order compared with tree.dat1 (e.g. 11-33 here, 33-11 there).
    # value1 is written out explicitly: `rep(3, 0.05)` is a zero-length vector
    # (times is truncated to 0), so a column built from it only reaches six rows
    # because data.frame() silently recycles the three remaining values.
    tree.dat2 <- data.frame(
      tree1 = c(rep(11, 3), rep(33, 2), 22),
      tree2 = c(22, 33, 44, 22, 44, 44),
      value1 = rep(c(0.02, 0.03, 0.03), times = 2)
    )
    

    构造 TreeID：把每一行 tree1 和 tree2 中较小的值放在前面、较大的值放在后面。这需要按行逐对取最小值/最大值（例如借助 rowwise()），而不是对整列取值。

    # Build an order-independent key for every pair: smaller ID first, larger ID
    # second. pmin()/pmax() compare the two columns element by element, so no
    # rowwise() pass is needed (much faster on large data) and the result is not
    # left grouped by row.
    # The "-" separator keeps keys unambiguous: without it the pairs (1, 23) and
    # (12, 3) would both collapse to "123".
    tree.dat1 <- tree.dat1 %>%
      mutate(TreeID = str_c(pmin(tree1, tree2), pmax(tree1, tree2), sep = "-"))

    tree.dat2 <- tree.dat2 %>%
      mutate(TreeID = str_c(pmin(tree1, tree2), pmax(tree1, tree2), sep = "-"))

    # Join on the key; TreeID values look like "22-33".
    left_join(tree.dat1, tree.dat2, by = "TreeID")
    
    
    Source: local data frame [6 x 7]
    Groups: <by row>
    
    # A tibble: 6 x 7
      tree1.x tree2.x value TreeID tree1.y tree2.y value1
        <dbl>   <dbl> <dbl> <chr>    <dbl>   <dbl>  <dbl>
    1      33      22  0.02 2233        33      22   0.02
    2      33      11  0.03 1133        11      33   0.03
    3      33      44  0.03 3344        33      44   0.03
    4      22      11  0.03 1122        11      22   0.02
    5      22      44  0.01 2244        22      44   0.03
    6      11      44  0.01 1144        11      44   0.03
    

    要精确匹配所需输出:

    # Join on the pair key, restore the original column names taken from
    # tree.dat1, and drop the helper columns (tree.dat2's copies of the IDs and
    # the key itself).
    tree.dat1 %>%
      left_join(tree.dat2, by = "TreeID") %>%
      rename(tree1 = tree1.x, tree2 = tree2.x) %>%
      select(-c(tree1.y, tree2.y, TreeID))
    
      tree1 tree2 value value1
      <dbl> <dbl> <dbl>  <dbl>
    1    33    22  0.02   0.02
    2    33    11  0.03   0.03
    3    33    44  0.03   0.03
    4    22    11  0.03   0.02
    5    22    44  0.01   0.03
    6    11    44  0.01   0.03
    
        2
  •  2
  •   Bing    7 年前

    # Sample data 1 (numeric IDs), written out value by value.
    tree.dat1 <- data.frame(
      tree1 = c(33, 33, 33, 22, 22, 11),
      tree2 = c(22, 11, 44, 11, 44, 44),
      value = c(0.02, 0.03, 0.03, 0.03, 0.01, 0.01)
    )
    
    # Sample data 2 (numeric IDs); some pairs are stored in the opposite column
    # order compared with tree.dat1.
    # value1 is written out explicitly: `rep(3, 0.05)` is a zero-length vector
    # (times is truncated to 0), so a column built from it only reaches six rows
    # because data.frame() silently recycles the three remaining values.
    tree.dat2 <- data.frame(
      tree1 = c(11, 11, 11, 33, 33, 22),
      tree2 = c(22, 33, 44, 22, 44, 44),
      value1 = rep(c(0.02, 0.03, 0.03), times = 2)
    )
    
    # Order-independent pair key "<smaller>-<larger>". pmin()/pmax() work on whole
    # columns at once, avoiding a row-by-row apply() (which also coerces the data
    # frame to a matrix) and scaling much better to large data sets.
    tree.dat1$id <- paste(
      pmin(tree.dat1$tree1, tree.dat1$tree2),
      pmax(tree.dat1$tree1, tree.dat1$tree2),
      sep = "-"
    )
    tree.dat2$id <- paste(
      pmin(tree.dat2$tree1, tree.dat2$tree2),
      pmax(tree.dat2$tree1, tree.dat2$tree2),
      sep = "-"
    )

    # Bring in only value1 via the key, selecting columns by name rather than by
    # position so the code keeps working if columns are added or reordered; the
    # helper key is then dropped from the result.
    tree.dat3 <- left_join(tree.dat1, tree.dat2[, c("value1", "id")], by = "id")
    tree.dat3$id <- NULL
    
    
    > tree.dat3
      tree1 tree2 value value1
    1    33    22  0.02   0.02
    2    33    11  0.03   0.03
    3    33    44  0.03   0.03
    4    22    11  0.03   0.02
    5    22    44  0.01   0.03
    6    11    44  0.01   0.03
    
    推荐文章