代码之家  ›  专栏  ›  技术社区  ›  Mark Davies

从变量z获取数据,其中变量x=变量y

  •  0
  • Mark Davies  · 技术社区  · 3 年前
    library(dplyr)
    df = data.frame(group_id = c(rep("a",5), rep("b",5)),
                    prod_id = 1:10,
                    prod_type = rep(c("a","a", "b", "c","d"),2),
                    start = lubridate::dmy(c("01/01/2001", "02/02/2002", "03/03/2003", "04/04/2004", "05/05/2005")),
                    fin = lubridate::dmy(c(NA,NA,NA,NA,NA)))%>%
      group_by(group_id) %>%
      mutate(next_acd = if_else(lead(prod_type) %in% c("a","c","d"), lead(prod_id), prod_id[NA])) %>%
      tidyr::fill(next_acd, .direction = "up")
    
    df
    group_id prod_id prod_type start      fin    next_acd
       <chr>      <int> <chr>     <date>     <date>    <int>
     1 a              1 a         2001-01-01 NA            2
     2 a              2 a         2002-02-02 NA            4
     3 a              3 b         2003-03-03 NA            4
     4 a              4 c         2004-04-04 NA            5
     5 a              5 d         2005-05-05 NA           NA
     6 b              6 a         2001-01-01 NA            7
     7 b              7 a         2002-02-02 NA            9
     8 b              8 b         2003-03-03 NA            9
     9 b              9 c         2004-04-04 NA           10
    10 b             10 d         2005-05-05 NA           NA
    

    在上面 next_acd 是下一个 prod_id 就是 prod_type “a”、“c”或“d”。

    我现在想 mutate fin 成为 start 日期 prod_id next_acd 。结果应该是:

     group_id prod_id prod_type start      fin        next_acd
       <chr>      <int> <chr>     <date>     <date>        <int>
     1 a              1 a         2001-01-01 2002-02-02        2
     2 a              2 a         2002-02-02 2004-04-04        4
     3 a              3 b         2003-03-03 2004-04-04        4
     4 a              4 c         2004-04-04 2005-05-05        5
     5 a              5 d         2005-05-05 NA               NA
     6 b              6 a         2001-01-01 2002-02-02        7
     7 b              7 a         2002-02-02 2004-04-04        9
     8 b              8 b         2003-03-03 2004-04-04        9
     9 b              9 c         2004-04-04 2005-05-05       10
    10 b             10 d         2005-05-05 NA               NA
    

    编辑 原始答案适用于示例数据,其中 prod_id 按数字顺序排列,没有任何遗漏。它在我的真实数据中不起作用,更像是:

    df = data.frame(group_id = c("G1","G1", rep("G2",8)),
                    prod_id = c(1,2,5,6,7,8,9,10,11,12),
                    prod_type = rep(c("a","a", "b", "c","d"),2),
                    start = lubridate::dmy(c("01/01/2001", "02/02/2002",
                                             "05/05/2005","06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009","01/01/2010", "02/02/2011", "03/03/2012" )),
                    fin = lubridate::dmy(NA))%>%
      group_by(group_id) %>%
      mutate(next_acd = if_else(lead(prod_type) %in% c("a","c","d"), lead(prod_id), prod_id[NA])) %>%
      tidyr::fill(next_acd, .direction = "up")%>%
      ungroup()
    

    我期待的结果是:

    group_id prod_id prod_type start      fin        next_acd
       <chr>      <dbl> <chr>     <date>     <date>        <dbl>
     1 G1             1 a         2001-01-01 2002-02-02        2
     2 G1             2 a         2002-02-02 NA               NA
     3 G2             5 b         2005-05-05 2006-06-06        6
     4 G2             6 c         2006-06-06 2007-07-07        7
     5 G2             7 d         2007-07-07 2008-08-08        8
     6 G2             8 a         2008-08-08 2009-09-09        9
     7 G2             9 a         2009-09-09 2011-02-02       11
     8 G2            10 b         2010-01-01 2011-02-02       11
     9 G2            11 c         2011-02-02 2012-03-03       12
    10 G2            12 d         2012-03-03 NA               NA
    

    但是尝试 df$fin <- df$start[df$prod_id[df$next_acd]] ,我得到:

    df
    # A tibble: 10 x 6
       group_id prod_id prod_type start      fin        next_acd
       <chr>      <dbl> <chr>     <date>     <date>        <dbl>
     1 G1             1 a         2001-01-01 2002-02-02        2
     2 G1             2 a         2002-02-02 NA               NA
     3 G2             5 b         2005-05-05 2010-01-01        6
     4 G2             6 c         2006-06-06 2011-02-02        7
     5 G2             7 d         2007-07-07 2012-03-03        8
     6 G2             8 a         2008-08-08 NA                9
     7 G2             9 a         2009-09-09 NA               11
     8 G2            10 b         2010-01-01 NA               11
     9 G2            11 c         2011-02-02 NA               12
    10 G2            12 d         2012-03-03 NA               NA
    

    我不明白这里发生了什么,也不知道如何解决。

    1 回复  |  直到 3 年前
        1
  •  1
  •   PaulS    3 年前

    一个可能的解决方案,在 base R :

    df$fin <- df$start[match(df$next_acd, df$prod_id)]
    
    #> # A tibble: 10 × 6
    #>    group_id prod_id prod_type start      fin        next_acd
    #>    <chr>      <dbl> <chr>     <date>     <date>        <dbl>
    #>  1 G1             1 a         2001-01-01 2002-02-02        2
    #>  2 G1             2 a         2002-02-02 NA               NA
    #>  3 G2             5 b         2005-05-05 2006-06-06        6
    #>  4 G2             6 c         2006-06-06 2007-07-07        7
    #>  5 G2             7 d         2007-07-07 2008-08-08        8
    #>  6 G2             8 a         2008-08-08 2009-09-09        9
    #>  7 G2             9 a         2009-09-09 2011-02-02       11
    #>  8 G2            10 b         2010-01-01 2011-02-02       11
    #>  9 G2            11 c         2011-02-02 2012-03-03       12
    #> 10 G2            12 d         2012-03-03 NA               NA
    

    或使用 dplyr :

    library(tidyverse)
    
    df %>% 
      mutate(fin = start[match(next_acd, prod_id)])
    
    #> # A tibble: 10 × 6
    #>    group_id prod_id prod_type start      fin        next_acd
    #>    <chr>      <dbl> <chr>     <date>     <date>        <dbl>
    #>  1 G1             1 a         2001-01-01 2002-02-02        2
    #>  2 G1             2 a         2002-02-02 NA               NA
    #>  3 G2             5 b         2005-05-05 2006-06-06        6
    #>  4 G2             6 c         2006-06-06 2007-07-07        7
    #>  5 G2             7 d         2007-07-07 2008-08-08        8
    #>  6 G2             8 a         2008-08-08 2009-09-09        9
    #>  7 G2             9 a         2009-09-09 2011-02-02       11
    #>  8 G2            10 b         2010-01-01 2011-02-02       11
    #>  9 G2            11 c         2011-02-02 2012-03-03       12
    #> 10 G2            12 d         2012-03-03 NA               NA