代码之家  ›  专栏  ›  技术社区  ›  Mus mzuba

如何按两个字段分组,并按出现日期统计每种类型的数据?

r
  •  1
  • Mus mzuba  · 技术社区  · 6 年前

    structure(list(department = structure(c(21L, 14L, 4L, 11L, 21L, 
    12L, 15L, 11L, 3L, 18L, 4L, 20L, 25L, 3L, 3L, 13L, 19L, 22L, 
    18L, 16L, 16L, 16L, 16L, 4L, 20L, 12L, 4L, 27L, 1L, 6L, 16L, 
    1L, 13L, 13L, 25L, 18L, 8L, 23L, 10L, 16L, 4L, 21L, 2L, 5L, 18L, 
    10L, 23L, 4L, 7L, 5L, 14L, 15L, 19L, 23L, 11L, 4L, 15L, 6L, 12L, 
    11L, 23L, 14L, 15L, 11L, 18L, 24L, 27L, 27L, 20L, 5L, 1L, 19L, 
    4L, 10L, 4L, 26L, 3L, 14L, 15L, 12L, 22L, 14L, 20L, 25L, 2L, 
    23L, 15L, 13L, 4L, 18L, 26L, 13L, 5L, 10L, 1L, 6L, 10L, 22L, 
    5L, 14L), .Label = c("Beauty", "Boutique advisor", "Boutique advisors", 
    "Boutique Stylist", "Clean Beauty Expert", "Conseiller en boutique", 
    "Design Consultant", "Designer Trade Specialist", "Food", "Furniture", 
    "In-store Design Expert", "In-store experts", "In-Store Sales Professional", 
    "In-Store Style Experts", "John Hardy", "Jos. A. Bank LIVE!", 
    "Levi's Stylists", "Lighting & Home Accessories", "Men's Wearhouse LIVE!", 
    "Menswear", "Personal advisors", "Styliste en boutique", "Vendeurs", 
    "Wine", "Women's Accessories", "Women's shoes", "Womenswear"), class = "factor"), 
        type = c("Completed", "Missed", "Missed", "Missed", "Missed", 
        "Missed", "Missed", "Completed", "Completed", "Missed", "Missed", 
        "Completed", "Completed", "Completed", "Completed", "Completed", 
        "Completed", "Completed", "Completed", "Missed", "Completed", 
        "Missed", "Completed", "Missed", "Missed", "Completed", "Missed", 
        "Missed", "Missed", "Completed", "Missed", "Completed", "Missed", 
        "Completed", "Missed", "Missed", "Completed", "Missed", "Missed", 
        "Completed", "Completed", "Missed", "Completed", "Missed", 
        "Completed", "Missed", "Missed", "Completed", "Missed", "Completed", 
        "Completed", "Missed", "Completed", "Missed", "Completed", 
        "Completed", "Missed", "Missed", "Missed", "Missed", "Completed", 
        "Missed", "Completed", "Completed", "Completed", "Missed", 
        "Missed", "Completed", "Missed", "Completed", "Completed", 
        "Missed", "Completed", "Completed", "Missed", "Missed", "Completed", 
        "Completed", "Completed", "Completed", "Missed", "Completed", 
        "Completed", "Completed", "Completed", "Completed", "Completed", 
        "Completed", "Completed", "Completed", "Completed", "Missed", 
        "Missed", "Completed", "Completed", "Completed", "Missed", 
        "Completed", "Missed", "Completed"), date = structure(c(17889, 
        17890, 17893, 17893, 17892, 17892, 17893, 17893, 17892, 17888, 
        17892, 17889, 17888, 17893, 17888, 17889, 17891, 17892, 17893, 
        17891, 17889, 17888, 17892, 17889, 17889, 17892, 17888, 17889, 
        17893, 17892, 17893, 17892, 17891, 17893, 17888, 17891, 17892, 
        17891, 17892, 17888, 17891, 17893, 17893, 17892, 17890, 17888, 
        17888, 17889, 17891, 17893, 17893, 17890, 17890, 17892, 17889, 
        17892, 17889, 17889, 17888, 17888, 17893, 17893, 17893, 17891, 
        17888, 17892, 17892, 17893, 17891, 17888, 17889, 17891, 17889, 
        17890, 17891, 17888, 17889, 17888, 17890, 17893, 17889, 17889, 
        17893, 17889, 17892, 17891, 17889, 17892, 17888, 17891, 17893, 
        17890, 17890, 17889, 17893, 17889, 17889, 17888, 17889, 17892
        ), class = "Date"), count = c(7L, 9L, 8L, 3L, 5L, 4L, 5L, 
        10L, 1L, 3L, 5L, 18L, 3L, 7L, 1L, 17L, 277L, 10L, 14L, 50L, 
        520L, 92L, 791L, 6L, 7L, 4L, 2L, 1L, 3L, 3L, 145L, 17L, 10L, 
        42L, 1L, 1L, 1L, 2L, 7L, 627L, 3L, 6L, 4L, 3L, 3L, 2L, 1L, 
        2L, 1L, 20L, 41L, 4L, 283L, 1L, 14L, 5L, 2L, 1L, 3L, 3L, 
        7L, 12L, 36L, 9L, 14L, 1L, 6L, 13L, 1L, 14L, 12L, 16L, 3L, 
        2L, 6L, 7L, 4L, 21L, 3L, 5L, 5L, 22L, 12L, 5L, 1L, 5L, 23L, 
        36L, 13L, 12L, 12L, 9L, 4L, 6L, 6L, 4L, 1L, 4L, 1L, 32L)), row.names = c(NA, 
    100L), class = "data.frame")
    

    enter image description here

    目前,我有两种方法来解决这一问题,这两种方法都不能产生预期的结果,但我怀疑我已经接近了,因为解决方案似乎介于两者之间。

    第一种方法:

    library(dplyr) # For the purpose of this reproducible example should you need it
    
    dept %>%
    group_by(
        department
      ) %>% 
      summarise(
        missed = sum(type == "Missed"),
        completed = sum(type == "Completed"),
        missed_pct = missed / (missed + completed)
      )
    

    这就给了我:

    # A tibble: 7 x 4
      department          missed completed missed_pct
      <fct>                <int>     <int>      <dbl>
    1 Beauty                   2         5      0.286
    2 Food                     0         1      0    
    3 Menswear                 4         6      0.4  
    4 Wine                     1         1      0.5  
    5 Women's Accessories      2         5      0.286
    6 Women's shoes            3         5      0.375
    7 Womenswear               4         5      0.444
    

    第二种方法:

    library(dplyr) # For the purpose of this reproducible example should you need it
    
    dept %>%
      group_by(
        department,
        date
      ) %>% 
      summarise(
        missed = sum(type == "Missed"),
        completed = sum(type == "Completed"),
        missed_pct = missed / (missed + completed)
      )
    

    这就给了我:

    # A tibble: 28 x 5
    # Groups:   department [?]
       department date       missed completed missed_pct
       <fct>      <date>      <int>     <int>      <dbl>
     1 Beauty     2018-12-23      0         1        0  
     2 Beauty     2018-12-24      0         1        0  
     3 Beauty     2018-12-26      0         1        0  
     4 Beauty     2018-12-27      1         1        0.5
     5 Beauty     2018-12-28      1         1        0.5
     6 Food       2018-12-27      0         1        0  
     7 Menswear   2018-12-23      1         1        0.5
     8 Menswear   2018-12-24      1         1        0.5
     9 Menswear   2018-12-25      0         1        0  
    10 Menswear   2018-12-26      1         1        0.5
    

    我该怎么做?

    2 回复  |  直到 6 年前
        1
  •  1
  •   Tom Haddow    6 年前

    使用 dplyr

    library(dplyr)
    dept %>%
      group_by(department, date, type) %>% 
      summarise(count = sum(count, na.rm = T)) %>%   # data had a few duplicate rows
      spread(type, count, fill = 0) %>% 
      as.data.frame() %>% 
      group_by(department, date) %>% 
      mutate(missed_pct = Missed / (Missed + Completed)) %>%
      melt(id.vars = c("department", "date")) %>% 
      spread(date, value) %>% 
      rename(type = variable)
    

    如果你想解释这些,请告诉我。基本上只是多次使用spread和melt,以确保数据在每个阶段都按我们所需的方式进行结构化(我建议运行到每个melt/spread,并注意data.frame是如何形成的。)

        2
  •  1
  •   anotherfred    6 年前

    您需要将数据从长格式改为宽格式,而不是分组。这就是所谓的“铸造”。

    library(reshape2)
    dcast(dept, department + type ~ date, fun.aggregate = sum)
    

    其中:

             department      type 2018-12-23 2018-12-24 2018-12-25 2018-12-26 2018-12-27 2018-12-28
    1            Beauty Completed          0         12          0          0         17          6
    2            Beauty    Missed          0          0          0          0          0          3
    3  Boutique advisor Completed          0          0          0          0          1          4
    4 Boutique advisors Completed          1          4          0          0          1          7
    5  Boutique Stylist Completed         13          5          0          3          5          0
    6  Boutique Stylist    Missed          2          6          0          6          5          8
    

    您的图像还显示了%行。你需要这个吗?

    编辑:要添加百分比行,请在重塑形状之前计算它们:

    dept %>%
    # create the percentage rows by grouping by department/date/type. Later we will combine these rows back with the original data
    group_by(department, date, type) %>%
    # add a column n with the sum of count in each group
    summarise(n=sum(count)) %>%
    # do 2 separate things:
    #   - add a percent column
    #   - change all the values in the type column to have a % at the end so they don't get mixed up with the original values later
    mutate(percent = n * 100 / sum(n), type = paste(type, "%")) %>%
    # remove all rows except the percent ones
    filter(type == "Missed %") %>%
    # remove the temporary 'n' column we created earlier, and rename the percent column to 'count' so it can go through the 'dcast' function later without any problems
    select(department, type, count = percent, "date") %>%
    # append with the original data
    bind_rows(dept) %>%
    # cast the data with the date column used as columns
    # and fill it with the sum of the 'count' column
    # the percentage rows we created earlier will pass through the function unharmed as there is only one of them in each department/type/date
    dcast(department + type ~ date, fun.aggregate = sum, value.var = "count")
    

             department      type 2018-12-23 2018-12-24 2018-12-25 2018-12-26 2018-12-27 2018-12-28
    1            Beauty Completed          0         12          0          0         17    6.00000
    2            Beauty    Missed          0          0          0          0          0    3.00000
    3            Beauty  Missed %          0          0          0          0          0   33.33333
    4  Boutique advisor Completed          0          0          0          0          1    4.00000
    5 Boutique advisors Completed          1          4          0          0          1    7.00000
    6  Boutique Stylist Completed         13          5          0          3          5    0.00000