代码之家  ›  专栏  ›  技术社区  ›  89_Simple

将dplyr转换为data.table

  •  0
  • 89_Simple  · 技术社区  · 6 年前
    library(dplyr)
    library(data.table)
    
    df <- data.frame(year = rep(2003:2004, each = 4), 
                 mg = rep(rep(c("a", "b"), each = 2), times = 2),
                 plant_date = c(20, 30, 20, 30, 33, 40, 33, 40),
                 stage1 = c(40, 50, 42, 52, 43, 55, 48, 57),
                 stage2 = c(55, 65, 57, 66, 58, 68, 59, 65),
                 stage3 = c(61, 75, 63, 76, 66, 77, 68, 79))
    
      set.seed(123)
      dat <- data.frame(year = rep(2003:2004, each = 365), doy = rep(1:365, times = 2), 
                  rainfall = sample(0:20, 730, replace = T))
    
      final.dat <- dat %>% dplyr::left_join(df)
    

    我使用dplyr进行了如下计算:

      final.dat %>% dplyr::group_by(year, plant_date, mg) %>% 
      dplyr::summarise(
        sum_rain_stage1 = sum(rainfall[doy >= plant_date & doy <= stage1]),
        sum_rain_stage2 = sum(rainfall[doy >= plant_date & doy <= stage2]),
        mean_rain_stage1 = mean(rainfall[doy >= plant_date & doy <= stage1]),
        mean_rain_stage2 = mean(rainfall[doy >= plant_date & doy <= stage2]),
        var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
        var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))
    

    我想用同样的方法 data.table

      final.dat <- data.table(final.dat)
    
      final.dat[, j = list(sum(rainfall[doy >= plant_date & doy <= stage1]), 
                           sum(rainfall[doy >= plant_date & doy <= stage2]),
                           mean(rainfall[doy >= plant_date & doy <= stage1]),
                           mean(rainfall[doy >= plant_date & doy <= stage2])),
                by = list(year, plant_date, mg)]
    

    我有两个问题: 1) 我怎样才能保持我的列名在 dplyr V1 , V2 , V3 V4 dplyr公司 在里面 数据表

      var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
      var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))
    

    3 回复  |  直到 6 年前
        1
  •  3
  •   eddi    6 年前
    dt = as.data.table(final.dat) # or setDT to convert in place
    
    dt[, .(sum_rain_stage1  = sum(rainfall[doy >= plant_date & doy <= stage1]), 
           sum_rain_stage2  = sum(rainfall[doy >= plant_date & doy <= stage2]),
           mean_rain_stage1 = mean(rainfall[doy >= plant_date & doy <= stage1]),
           mean_rain_stage2 = mean(rainfall[doy >= plant_date & doy <= stage2]))
       , by = .(year, plant_date, mg)][
       , `:=`(var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
              var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))][]
    
        2
  •  3
  •   IceCreamToucan    6 年前

    备份一步,你的方法的条件像 doy >= plant_date & doy <= stage1

    doy %between% list(plant_date, stage1) 所以不必输入两次var名称。

    第二,自从 doy 而其他的列来自不同的表,您可以在非equi联接中进行更新 df 而不是做新桌子 final.dat :

    setDT(df)
    setDT(dat)
    df[, c("sum_rs1", "mean_rs1") := 
      dat[df, on=.(doy >= plant_date, doy <= stage1), .(sum(rainfall), mean(rainfall)), by=.EACHI][, .(V1, V2)]]
    df[, c("sum_rs2", "mean_rs2") := 
      dat[df, on=.(doy >= plant_date, doy <= stage2), .(sum(rainfall), mean(rainfall)), by=.EACHI][, .(V1, V2)]]
    
    # same as @eddi's
    df[, `:=`(
      var.x1 = (sum_rs1 > sum_rs2) * (mean_rs1 - mean_rs2),
      var.x2 = (mean_rs1 > mean_rs2) * (sum_rs1 - sum_rs2)
    )]
    
        3
  •  0
  •   akrun    6 年前

    list

    list(a = 1:2, b = 3:5)
    

    同样地,

    final.dat[,  list(sum_rain_stage1 = sum(rainfall[doy >= plant_date & doy <= stage1]),  
                     sum_rain_stage2 =  sum(rainfall[doy >= plant_date & doy <= stage2]),
                     mean_rain_stage1 =        mean(rainfall[doy >= plant_date & doy <= stage1]),
                     mean_rain_stage2 =        mean(rainfall[doy >= plant_date & doy <= stage2])),
            by = list(year, plant_date, mg)]
    # year plant_date mg sum_rain_stage1 sum_rain_stage2 mean_rain_stage1 mean_rain_stage2
    #1: 2003         20  a             251             355        11.952381         9.861111
    #2: 2003         30  a             176             304         8.380952         8.444444
    #3: 2003         20  b             261             361        11.347826         9.500000
    #4: 2003         30  b             185             313         8.043478         8.459459
    #5: 2004         33  a             109             247         9.909091         9.500000
    #6: 2004         40  a             134             279         8.375000         9.620690
    #7: 2004         33  b             157             253         9.812500         9.370370
    #8: 2004         40  b             158             242         8.777778         9.307692