代码之家  ›  专栏  ›  技术社区  ›  mmyoung77

如何在R/dplyr中最多取几列

  •  0
  • mmyoung77  · 技术社区  · 4 年前

    id <- c(1:5)
    VolumeA <- c(12, NA, NA, NA, NA)
    VolumeB <- c(NA, 34, NA, NA, NA)
    VolumeC <- c(NA, NA, 56, NA, NA)
    VolumeD <- c(NA, NA, NA, 78, NA)
    VolumeE <- c(NA, NA, NA, NA, 90)
    
    df_now <- tibble(id, VolumeA, VolumeB, VolumeC, VolumeD, VolumeE)
    df_now
    
    # A tibble: 5 x 6
         id VolumeA VolumeB VolumeC VolumeD VolumeE
      <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
    1     1      12      NA      NA      NA      NA
    2     2      NA      34      NA      NA      NA
    3     3      NA      NA      56      NA      NA
    4     4      NA      NA      NA      78      NA
    5     5      NA      NA      NA      NA      90
    

    Volume[label] 列,但在每一行中我只需要其中一个:最大的一个。所以我想创建一个值最大的新变量:

    Volume <- c(12, 34, 56, 78, 90)
    df_desired <- cbind(df_now, Volume)
    df_desired
    
      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    1  1      12      NA      NA      NA      NA     12
    2  2      NA      34      NA      NA      NA     34
    3  3      NA      NA      56      NA      NA     56
    4  4      NA      NA      NA      78      NA     78
    5  5      NA      NA      NA      NA      90     90
    

    在看了dplyr文档之后,我尝试了这个。。。

    library(tidyverse)
    df_try <- df_now %>%
      mutate(Volume = across(contains("Volume"), max, na.rm = TRUE))
    

    …但是得到了一堆数据,而不是一列。有人能告诉我怎么做吗?

    gather spread 我的数据。我想用条件句 mutate .)

    1 回复  |  直到 4 年前
        1
  •  2
  •   akrun    4 年前

    pmax (第一次发布 解决方案)。注意,相对改善是非常小的 do.call

    library(dplyr)
    library(purrr)
    df_now %>%
        mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))
    # A tibble: 5 x 7
    #     id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    #1     1      12      NA      NA      NA      NA     12
    #2     2      NA      34      NA      NA      NA     34
    #3     3      NA      NA      56      NA      NA     56
    #4     4      NA      NA      NA      78      NA     78
    #5     5      NA      NA      NA      NA      90     90
    

    或与 c_across max (仅使用 tidyverse 方法)

    df_now %>%
       rowwise %>%
       mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))
    # A tibble: 5 x 7
    # Rowwise: 
    #     id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    #1     1      12      NA      NA      NA      NA     12
    #2     2      NA      34      NA      NA      NA     34
    #3     3      NA      NA      56      NA      NA     56
    #4     4      NA      NA      NA      78      NA     78
    #5     5      NA      NA      NA      NA      90     90
    

    基准

    system.time({df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))})
    #   user  system elapsed 
    #  0.023   0.006   0.029 
    
    system.time({df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))})
    #   user  system elapsed 
    #  0.012   0.002   0.015 
    
    system.time({df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))})
    #   user  system elapsed 
    #  0.011   0.001   0.011 
    

    注意:时间上没有太大的差别

        2
  •  3
  •   r2evans    4 年前

    “还有很多 Volume[label] “列” ,任何对每行有效的解决方案( rowwise )或在每列上单独显示(带 reduce Reduce )会比必要的慢很多。

    df_now %>%
      mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
    # # A tibble: 5 x 7
    #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
    #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
    # 1     1      12      NA      NA      NA      NA     12
    # 2     2      NA      34      NA      NA      NA     34
    # 3     3      NA      NA      56      NA      NA     56
    # 4     4      NA      NA      NA      78      NA     78
    # 5     5      NA      NA      NA      NA      90     90
    

    相对改善的证明:

    1. 使用 减少 purrr::reduce 或者任何会对每列进行迭代的内容(好吧,使用 nc nc-1 次数):

      mypmax <- function(...) { message("mypmax"); pmax(...); }
      df_now %>%
        mutate(Volume = reduce(select(., starts_with('Volume')), mypmax, na.rm = TRUE))
      # mypmax
      # mypmax
      # mypmax
      # mypmax
      # # A tibble: 5 x 7
      #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
      #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
      # 1     1      12      NA      NA      NA      NA     12
      # 2     2      NA      34      NA      NA      NA     34
      # 3     3      NA      NA      56      NA      NA     56
      # 4     4      NA      NA      NA      78      NA     78
      # 5     5      NA      NA      NA      NA      90     90
      
    2. 有什么事吗 吵闹的 每行执行一次,可能更糟(假设数据中的行多于列:

      mymax <- function(...) { message("mymax"); max(...); }
      df_now %>%
        rowwise %>%
        mutate(Volume = mymax(c_across(starts_with('Volume')), na.rm = TRUE))
      # mymax
      # mymax
      # mymax
      # mymax
      # mymax
      # # A tibble: 5 x 7
      # # Rowwise: 
      #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
      #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
      # 1     1      12      NA      NA      NA      NA     12
      # 2     2      NA      34      NA      NA      NA     34
      # 3     3      NA      NA      56      NA      NA     56
      # 4     4      NA      NA      NA      78      NA     78
      # 5     5      NA      NA      NA      NA      90     90
      
    3. 去做吧 一旦

      mypmax <- function(...) { message("mypmax"); pmax(...); }
      df_now %>%
        mutate(Volume = do.call(mypmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
      # mypmax
      # # A tibble: 5 x 7
      #      id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
      #   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
      # 1     1      12      NA      NA      NA      NA     12
      # 2     2      NA      34      NA      NA      NA     34
      # 3     3      NA      NA      56      NA      NA     56
      # 4     4      NA      NA      NA      78      NA     78
      # 5     5      NA      NA      NA      NA      90     90
      

    在这种规模下,基准比较小,但在更大的数据下,基准比较引人注目:

    microbenchmark::microbenchmark(
      red = df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE)),
      row = df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE)),
      sgl = df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
    )
    # Unit: milliseconds
    #  expr    min      lq     mean  median      uq     max neval
    #   red 4.9736 5.36240 7.240561 5.68010 6.19915 70.7482   100
    #   row 4.5813 5.02020 6.082047 5.34460 5.70345 63.1166   100
    #   sgl 3.8270 4.18605 5.803043 4.43215 4.76030 65.7217   100