代码之家  ›  专栏  ›  技术社区  ›  Dan

在联接期间保留两个数据表中的所有列,然后添加一列

  •  1
  • Dan  · 技术社区  · 7 年前

    # Seed random number generator
    set.seed(33550336)
    
    # Create data tables
    dt1 <- data.table(ID = sample(LETTERS[1:5], 20, replace = TRUE),
                      loc = sample(1:50, 20, replace = TRUE),
                      a = runif(20),
                      b = runif(20),
                      c = runif(20),
                      d = runif(20))
    
    
    
    dt2 <- data.table(ID = sample(LETTERS[1:5], 20, replace = TRUE),
                      loc = sample(1:50, 20, replace = TRUE),
                      e = runif(20),
                      f = runif(20),
                      g = runif(20),
                      h = runif(20))
    

    我想这样加入他们 this answer ):

    # Join on ID and nearest rolling join on loc
    dt2[dt1,
        on = .(ID, loc),
        roll = "nearest"]
    
    #     ID loc         e          f         g          h          a          b          c           d
    #  1:  E   2 0.6080648 0.59558616 0.9680243 0.65885155 0.75533475 0.46796072 0.07874670 0.372224933
    #  2:  B  22 0.2900181 0.89395076 0.5012072 0.81403388 0.24129711 0.66914193 0.11941211 0.330982361
    #  3:  C  23 0.7753557 0.31772779 0.3302613 0.02004258 0.32252276 0.09341920 0.29665070 0.563954195
    #  4:  A  46 0.1193827 0.89183103 0.7142606 0.17231293 0.62979589 0.19621242 0.48943734 0.318145133
    #  5:  B  26 0.2900181 0.89395076 0.5012072 0.81403388 0.65672029 0.45106318 0.47421905 0.605327569
    #  6:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.83821385 0.99078941 0.93356571 0.459227328
    #  7:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.86831894 0.41260922 0.65389531 0.930843432
    #  8:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.82042112 0.82906524 0.59829109 0.859362233
    #  9:  D  44 0.3958956 0.06361996 0.8068514 0.56349064 0.29823590 0.04765864 0.65412304 0.742808806
    # 10:  E  11 0.4417452 0.03226111 0.5975499 0.49336668 0.15013055 0.83683385 0.18847332 0.139363770
    # 11:  D  11 0.5967619 0.23497655 0.5429504 0.56322079 0.68644344 0.46995509 0.35128292 0.910443478
    # 12:  A  50 0.1193827 0.89183103 0.7142606 0.17231293 0.65811523 0.48901176 0.96854302 0.875838825
    # 13:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.93484739 0.57810132 0.75250483 0.607710552
    # 14:  A  21 0.4491745 0.61724476 0.3283133 0.51406071 0.96610736 0.03222779 0.05768814 0.436536989
    # 15:  A   6 0.4491745 0.61724476 0.3283133 0.51406071 0.69975907 0.35564120 0.42206040 0.309386788
    # 16:  B  49 0.1152318 0.99716746 0.1440101 0.70734795 0.05138897 0.80463532 0.41856763 0.421029334
    # 17:  C   9 0.1204828 0.47622000 0.6802176 0.36385191 0.98509395 0.49711655 0.68159049 0.003570911
    # 18:  D   7 0.5967619 0.23497655 0.5429504 0.56322079 0.69862668 0.91597522 0.53630369 0.297000037
    # 19:  C   8 0.1204828 0.47622000 0.6802176 0.36385191 0.80761410 0.87051653 0.93177628 0.671692311
    # 20:  B   5 0.5652708 0.50866629 0.3992037 0.87643314 0.69493460 0.99878010 0.77953456 0.820925302
    

    这太棒了。只缺少一件事:两者之间的区别 loc 在里面 dt1 dt2 (即。, delta = abs(x.loc - i.loc) ). 然而,唯一 位置 剩余的来自 ,所以我现在不能做这个计算。

    dt2[dt1,
            on = c("ID", "loc"),
            roll = "nearest", 
            .(ID, loc = i.loc, a, b, c, d, e, f, g, h, delta = abs(x.loc - i.loc))][]
    
    #     ID loc          a          b          c           d         e          f         g          h delta
    #  1:  E   2 0.75533475 0.46796072 0.07874670 0.372224933 0.6080648 0.59558616 0.9680243 0.65885155     1
    #  2:  B  22 0.24129711 0.66914193 0.11941211 0.330982361 0.2900181 0.89395076 0.5012072 0.81403388     5
    #  3:  C  23 0.32252276 0.09341920 0.29665070 0.563954195 0.7753557 0.31772779 0.3302613 0.02004258     6
    #  4:  A  46 0.62979589 0.19621242 0.48943734 0.318145133 0.1193827 0.89183103 0.7142606 0.17231293     0
    #  5:  B  26 0.65672029 0.45106318 0.47421905 0.605327569 0.2900181 0.89395076 0.5012072 0.81403388     1
    #  6:  E  17 0.83821385 0.99078941 0.93356571 0.459227328 0.4417452 0.03226111 0.5975499 0.49336668     2
    #  7:  D  24 0.86831894 0.41260922 0.65389531 0.930843432 0.8974042 0.90725532 0.5008502 0.21681072    14
    #  8:  D  24 0.82042112 0.82906524 0.59829109 0.859362233 0.8974042 0.90725532 0.5008502 0.21681072    14
    #  9:  D  44 0.29823590 0.04765864 0.65412304 0.742808806 0.3958956 0.06361996 0.8068514 0.56349064     1
    # 10:  E  11 0.15013055 0.83683385 0.18847332 0.139363770 0.4417452 0.03226111 0.5975499 0.49336668     4
    # 11:  D  11 0.68644344 0.46995509 0.35128292 0.910443478 0.5967619 0.23497655 0.5429504 0.56322079     8
    # 12:  A  50 0.65811523 0.48901176 0.96854302 0.875838825 0.1193827 0.89183103 0.7142606 0.17231293     4
    # 13:  E  17 0.93484739 0.57810132 0.75250483 0.607710552 0.4417452 0.03226111 0.5975499 0.49336668     2
    # 14:  A  21 0.96610736 0.03222779 0.05768814 0.436536989 0.4491745 0.61724476 0.3283133 0.51406071     4
    # 15:  A   6 0.69975907 0.35564120 0.42206040 0.309386788 0.4491745 0.61724476 0.3283133 0.51406071    19
    # 16:  B  49 0.05138897 0.80463532 0.41856763 0.421029334 0.1152318 0.99716746 0.1440101 0.70734795     6
    # 17:  C   9 0.98509395 0.49711655 0.68159049 0.003570911 0.1204828 0.47622000 0.6802176 0.36385191     3
    # 18:  D   7 0.69862668 0.91597522 0.53630369 0.297000037 0.5967619 0.23497655 0.5429504 0.56322079     4
    # 19:  C   8 0.80761410 0.87051653 0.93177628 0.671692311 0.1204828 0.47622000 0.6802176 0.36385191     2
    # 20:  B   5 0.69493460 0.99878010 0.77953456 0.820925302 0.5652708 0.50866629 0.3992037 0.87643314     1
    

    这是完美的,除了必须命名每一列。因此,作为一种解决方法,我保留 全部的 两个数据表中的列(使用 mget ),然后计算 delta 通过链接:

    # Columns to select
    cols2sel <- c(paste0("x.", names(dt2)), paste0("i.", names(dt1)))
    
    dt2[dt1,
        on = c("ID", "loc"),
        roll = "nearest", 
        mget(cols2sel)][, delta := abs(x.loc - i.loc)][]
    
    #     x.ID x.loc       x.e        x.f       x.g        x.h i.ID i.loc        i.a        i.b        i.c         i.d delta
    #  1:    E     1 0.6080648 0.59558616 0.9680243 0.65885155    E     2 0.75533475 0.46796072 0.07874670 0.372224933     1
    #  2:    B    27 0.2900181 0.89395076 0.5012072 0.81403388    B    22 0.24129711 0.66914193 0.11941211 0.330982361     5
    #  3:    C    29 0.7753557 0.31772779 0.3302613 0.02004258    C    23 0.32252276 0.09341920 0.29665070 0.563954195     6
    #  4:    A    46 0.1193827 0.89183103 0.7142606 0.17231293    A    46 0.62979589 0.19621242 0.48943734 0.318145133     0
    #  5:    B    27 0.2900181 0.89395076 0.5012072 0.81403388    B    26 0.65672029 0.45106318 0.47421905 0.605327569     1
    #  6:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    17 0.83821385 0.99078941 0.93356571 0.459227328     2
    #  7:    D    38 0.8974042 0.90725532 0.5008502 0.21681072    D    24 0.86831894 0.41260922 0.65389531 0.930843432    14
    #  8:    D    38 0.8974042 0.90725532 0.5008502 0.21681072    D    24 0.82042112 0.82906524 0.59829109 0.859362233    14
    #  9:    D    45 0.3958956 0.06361996 0.8068514 0.56349064    D    44 0.29823590 0.04765864 0.65412304 0.742808806     1
    # 10:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    11 0.15013055 0.83683385 0.18847332 0.139363770     4
    # 11:    D     3 0.5967619 0.23497655 0.5429504 0.56322079    D    11 0.68644344 0.46995509 0.35128292 0.910443478     8
    # 12:    A    46 0.1193827 0.89183103 0.7142606 0.17231293    A    50 0.65811523 0.48901176 0.96854302 0.875838825     4
    # 13:    E    15 0.4417452 0.03226111 0.5975499 0.49336668    E    17 0.93484739 0.57810132 0.75250483 0.607710552     2
    # 14:    A    25 0.4491745 0.61724476 0.3283133 0.51406071    A    21 0.96610736 0.03222779 0.05768814 0.436536989     4
    # 15:    A    25 0.4491745 0.61724476 0.3283133 0.51406071    A     6 0.69975907 0.35564120 0.42206040 0.309386788    19
    # 16:    B    43 0.1152318 0.99716746 0.1440101 0.70734795    B    49 0.05138897 0.80463532 0.41856763 0.421029334     6
    # 17:    C     6 0.1204828 0.47622000 0.6802176 0.36385191    C     9 0.98509395 0.49711655 0.68159049 0.003570911     3
    # 18:    D     3 0.5967619 0.23497655 0.5429504 0.56322079    D     7 0.69862668 0.91597522 0.53630369 0.297000037     4
    # 19:    C     6 0.1204828 0.47622000 0.6802176 0.36385191    C     8 0.80761410 0.87051653 0.93177628 0.671692311     2
    # 20:    B     6 0.5652708 0.50866629 0.3992037 0.87643314    B     5 0.69493460 0.99878010 0.77953456 0.820925302     1
    

    这几乎给了我想要的东西,但是现在我不得不纠结于修复列名,删除重复的列(例如。, ID )与Jaap优雅的初始解不同。但是,该解决方案需要命名所有列。

    我的问题是:有没有一种方法既能做到两全其美,又不必指定每一列,同时又能像上面的代码块3那样得到一个干净的格式?

    1 回复  |  直到 7 年前
        1
  •  2
  •   Dan    7 年前

    多亏了 post by sritchie73 他在上面的评论中提供了@Henrik链接。一种解决方案是在联接之前复制联接中使用的变量,以便它们保留在结果中并可用于计算。

    # Copy loc variables
    dt1[, loc1 := loc]
    dt2[, loc2 := loc]
    
    # Perform join, calculate delta, drop loc1 & loc2    
    dt2[dt1,
        on = c("ID", "loc"),
        roll = "nearest"][
          , delta := abs(loc1 - loc2)][
            , c("loc1", "loc2") := NULL][]
    

    #     ID loc         e          f         g          h          a          b          c           d delta
    #  1:  E   2 0.6080648 0.59558616 0.9680243 0.65885155 0.75533475 0.46796072 0.07874670 0.372224933     1
    #  2:  B  22 0.2900181 0.89395076 0.5012072 0.81403388 0.24129711 0.66914193 0.11941211 0.330982361     5
    #  3:  C  23 0.7753557 0.31772779 0.3302613 0.02004258 0.32252276 0.09341920 0.29665070 0.563954195     6
    #  4:  A  46 0.1193827 0.89183103 0.7142606 0.17231293 0.62979589 0.19621242 0.48943734 0.318145133     0
    #  5:  B  26 0.2900181 0.89395076 0.5012072 0.81403388 0.65672029 0.45106318 0.47421905 0.605327569     1
    #  6:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.83821385 0.99078941 0.93356571 0.459227328     2
    #  7:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.86831894 0.41260922 0.65389531 0.930843432    14
    #  8:  D  24 0.8974042 0.90725532 0.5008502 0.21681072 0.82042112 0.82906524 0.59829109 0.859362233    14
    #  9:  D  44 0.3958956 0.06361996 0.8068514 0.56349064 0.29823590 0.04765864 0.65412304 0.742808806     1
    # 10:  E  11 0.4417452 0.03226111 0.5975499 0.49336668 0.15013055 0.83683385 0.18847332 0.139363770     4
    # 11:  D  11 0.5967619 0.23497655 0.5429504 0.56322079 0.68644344 0.46995509 0.35128292 0.910443478     8
    # 12:  A  50 0.1193827 0.89183103 0.7142606 0.17231293 0.65811523 0.48901176 0.96854302 0.875838825     4
    # 13:  E  17 0.4417452 0.03226111 0.5975499 0.49336668 0.93484739 0.57810132 0.75250483 0.607710552     2
    # 14:  A  21 0.4491745 0.61724476 0.3283133 0.51406071 0.96610736 0.03222779 0.05768814 0.436536989     4
    # 15:  A   6 0.4491745 0.61724476 0.3283133 0.51406071 0.69975907 0.35564120 0.42206040 0.309386788    19
    # 16:  B  49 0.1152318 0.99716746 0.1440101 0.70734795 0.05138897 0.80463532 0.41856763 0.421029334     6
    # 17:  C   9 0.1204828 0.47622000 0.6802176 0.36385191 0.98509395 0.49711655 0.68159049 0.003570911     3
    # 18:  D   7 0.5967619 0.23497655 0.5429504 0.56322079 0.69862668 0.91597522 0.53630369 0.297000037     4
    # 19:  C   8 0.1204828 0.47622000 0.6802176 0.36385191 0.80761410 0.87051653 0.93177628 0.671692311     2
    # 20:  B   5 0.5652708 0.50866629 0.3992037 0.87643314 0.69493460 0.99878010 0.77953456 0.820925302     1