代码之家  ›  专栏  ›  技术社区  ›  Ashtray K

使用2个表创建新功能

  •  2
  • Ashtray K  · 技术社区  · 8 年前
    table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
                     product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
                     first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
                     last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
    table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
                     order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
                     days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
    

    我想使用table2向table1添加新功能。 新特性是每个用户对每个产品的订购间隔。

    例如,让我们看看表1。第一行有(user\u id==1)、(product\u id==14)、(First order==1)和(last order==4)。这意味着产品14是按订单1、2、3、4订购的。我们可以在表2中找到这个订单号。

    我想可以用join完成,但是 因为每个桌子其实都很大。

    我在for循环中使用下面的函数:

    f <- function(i){
      a <- table2 %>% 
        filter(user_id==table1[i, 1]) %>% 
        filter(order_number==table1[i, 3] | order_number==table1[i, 4])
    
      ifelse(nrow(a)==2, a[2, 3] - a[1, 3], 999999) # first_order==last_order
    }
    

    有更好的代码吗?我在等你的帮助。

    2 回复  |  直到 8 年前
        1
  •  3
  •   Aramis7d    8 年前

    你能分享运行时/计算时间比较吗 join

    下面是一个使用连接的解决方案。

    library(tidyverse)
    
    df1 <- as.data.frame(table1)
    df2 <- as.data.frame(table2)
    
    
    df1 %>%
      left_join(df2, by = c("user_id"="user_id", "first_order" = "order_number")) %>%
      rename(dayMin = days_cumsum) %>%
      left_join(df2, by = c("user_id"="user_id", "last_order" = "order_number")) %>%
      rename(dayMax = days_cumsum) %>%
      mutate(newVar = dayMax-dayMin) %>%
      select(user_id, product_id, first_order, last_order, newVar)
    

    其中给出:

       user_id product_id first_order last_order newVar
         <dbl>      <dbl>       <dbl>      <dbl>  <dbl>
     1       1         14           1          4     26
     2       1         24           2          7     46
     3       1         38           1          5     34
     4       1         40           4          8     33
     5       1         66           5          8     25
     6       1          2           3          3      0
     7       1         19           2          4     19
     8       1         30           4          7     27
     9       1         71           2          5     27
    10       1         98           4          9     40
    11       2          7           2          4     13
    12       2         16           3          5     12
    
        2
  •  2
  •   cderv    8 年前

    为了进行比较,一些解决方案使用 data.table .

    table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
                         product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
                         first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
                         last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
    table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
                         order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
                         days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))
    
    library(data.table)
    
    setDT(table1)
    setDT(table2)
    
    table1
    #>     user_id product_id first_order last_order
    #>  1:       1         14           1          4
    #>  2:       1         24           2          7
    #>  3:       1         38           1          5
    #>  4:       1         40           4          8
    #>  5:       1         66           5          8
    #>  6:       1          2           3          3
    #>  7:       1         19           2          4
    #>  8:       1         30           4          7
    #>  9:       1         71           2          5
    #> 10:       1         98           4          9
    #> 11:       2          7           2          4
    #> 12:       2         16           3          5
    table2
    #>     user_id order_number days_cumsum
    #>  1:       1            1           0
    #>  2:       1            2           7
    #>  3:       1            3          15
    #>  4:       1            4          26
    #>  5:       1            5          34
    #>  6:       1            6          43
    #>  7:       1            7          53
    #>  8:       1            8          59
    #>  9:       1            9          66
    #> 10:       1           10          74
    #> 11:       1           11          82
    #> 12:       1           12          91
    #> 13:       2            1           5
    #> 14:       2            2          11
    #> 15:       2            3          17
    #> 16:       2            4          24
    #> 17:       2            5          29
    #> 18:       2            6          35
    
    DayMin <- table1[table2, on = .(user_id, first_order = order_number), nomatch = 0]
    setnames(DayMin, "days_cumsum", "dayMin")
    DayMax <- table1[table2, on = .(user_id, last_order = order_number), nomatch = 0]
    setnames(DayMax, "days_cumsum", "dayMax")
    res <- DayMin[DayMax, on = .(user_id, product_id, first_order, last_order)]
    # calculate diff and delete column
    res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
    res[]
    #>     user_id product_id first_order last_order diff
    #>  1:       1          2           3          3    0
    #>  2:       1         14           1          4   26
    #>  3:       1         19           2          4   19
    #>  4:       1         38           1          5   34
    #>  5:       1         71           2          5   27
    #>  6:       1         24           2          7   46
    #>  7:       1         30           4          7   27
    #>  8:       1         40           4          8   33
    #>  9:       1         66           5          8   25
    #> 10:       1         98           4          9   40
    #> 11:       2          7           2          4   13
    #> 12:       2         16           3          5   12
    

    “管道式”版本,无需重命名

    table1[table2, on = .(user_id, first_order = order_number), nomatch = 0][
        table2, on = .(user_id , last_order = order_number), nomatch = 0][
          , `:=`(
            diff = i.days_cumsum - days_cumsum, 
            days_cumsum = NULL, 
            i.days_cumsum = NULL
          )][]
    #>     user_id product_id first_order last_order diff
    #>  1:       1          2           3          3    0
    #>  2:       1         14           1          4   26
    #>  3:       1         19           2          4   19
    #>  4:       1         38           1          5   34
    #>  5:       1         71           2          5   27
    #>  6:       1         24           2          7   46
    #>  7:       1         30           4          7   27
    #>  8:       1         40           4          8   33
    #>  9:       1         66           5          8   25
    #> 10:       1         98           4          9   40
    #> 11:       2          7           2          4   13
    #> 12:       2         16           3          5   12
    

    仅对一个合并使用整形

    tab <- melt(table1, id = 1:2, value.name = "order_number")[table2, on = .(user_id, order_number), nomatch = 0]
    res <- dcast(tab, user_id + product_id ~ variable, value.var = c("order_number", "days_cumsum"), sep = "#")
    setnames(res, c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax"))
    res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
    res
    #>     user_id product_id first_order last_order diff
    #>  1:       1          2           3          3    0
    #>  2:       1         14           1          4   26
    #>  3:       1         19           2          4   19
    #>  4:       1         24           2          7   46
    #>  5:       1         30           4          7   27
    #>  6:       1         38           1          5   34
    #>  7:       1         40           4          8   33
    #>  8:       1         66           5          8   25
    #>  9:       1         71           2          5   27
    #> 10:       1         98           4          9   40
    #> 11:       2          7           2          4   13
    #> 12:       2         16           3          5   12