代码之家  ›  专栏  ›  技术社区  ›  Zuzana

聚合和分解组/簇以实现组内相关系数的最大值

  •  0
  • Zuzana  · 技术社区  · 2 年前

    我有一个数据帧,它具有5个类别的分层分类——主要组(1位数)、次要组(2位数)、次主要组(3位数)、单位组(4位数)和代码(7位数)。每一行都是唯一的,都有自己的视图。

    df = structure(list(order_id = c("5015900", "5015903", "5015904", 
    "5015905", "5015906", "5015908", "5015909", "5015910", "5015911", 
    "5015912", "5015913", "5015916", "5015917", "5015918", "5015920", 
    "5015921", "5015922", "5015923", "5015924", "5015925", "5015926", 
    "5015927", "5015928", "5015929", "5015930", "5015931", "5015932", 
    "5015933", "5015935", "5015936", "5015937", "5015938", "5015939", 
    "5015940", "5015941", "5015942", "5015943", "5015950", "5015951", 
    "5015957", "5015958", "5015959", "5015960", "5015961", "5015963", 
    "5015964", "5015965", "5015966", "5015967", "5015968", "5015969", 
    "5015970", "5015971", "5015972", "5015973", "5015979", "5015980", 
    "5015985", "5015991", "5015992", "5015993", "5015994", "5015995", 
    "5016001", "5016008", "5016017", "5016021", "5016022", "5016023", 
    "5016025", "5016026", "5016027", "5016028", "5016029", "5016030", 
    "5016031", "5016032", "5016033", "5016034", "5016036", "5016037", 
    "5016039", "5016041", "5016043", "5016044", "5016046", "5016047", 
    "5016048", "5016049", "5016050", "5016051", "5016052", "5016053", 
    "5016054", "5016055", "5016056", "5016057", "5016058", "5016059", 
    "5016060"), code = c(2212999, 3322000, 2412001, 2412001, 7543999, 
    8219999, 2421003, 4311001, 1330999, 2423003, 8219999, 8219999, 
    8219999, 8344000, 4321001, 5132002, 8344000, 8344000, 8219999, 
    8219999, 7223003, 7212999, 8219999, 8219999, 4321001, 8219999, 
    8219999, 8219999, 4321001, 2512001, 8219999, 2514000, 2412001, 
    5131001, 8219999, 8219999, 8219999, 7212999, 2342001, 7212999, 
    8219999, 8219999, 3433999, 5223999, 3322000, 2144003, 2512001, 
    2423999, 8219999, 5131001, 2521003, 8219999, 2142019, 7411001, 
    8342001, 5244002, 3434000, 4321001, 8332999, 3322000, 8219999, 
    5169999, 2411011, 5223999, 8219999, 8219999, 4211003, 2413999, 
    4211003, 2512001, 7422001, 2511001, 2511001, 2512001, 2522001, 
    2421003, 2512001, 2529001, 2144003, 2431005, 2341002, 8219999, 
    8219999, 8344000, 8219999, 8344000, 8219999, 8344000, 8219999, 
    8219999, 9331000, 5120000, 9621001, 2149020, 9629999, 8219999, 
    5230003, 8219999, 9331000, 4321001), unit_group = c(2212, 3322, 
    2412, 2412, 7543, 8219, 2421, 4311, 1330, 2423, 8219, 8219, 8219, 
    8344, 4321, 5132, 8344, 8344, 8219, 8219, 7223, 7212, 8219, 8219, 
    4321, 8219, 8219, 8219, 4321, 2512, 8219, 2514, 2412, 5131, 8219, 
    8219, 8219, 7212, 2342, 7212, 8219, 8219, 3433, 5223, 3322, 2144, 
    2512, 2423, 8219, 5131, 2521, 8219, 2142, 7411, 8342, 5244, 3434, 
    4321, 8332, 3322, 8219, 5169, 2411, 5223, 8219, 8219, 4211, 2413, 
    4211, 2512, 7422, 2511, 2511, 2512, 2522, 2421, 2512, 2529, 2144, 
    2431, 2341, 8219, 8219, 8344, 8219, 8344, 8219, 8344, 8219, 8219, 
    9331, 5120, 9621, 2149, 9629, 8219, 5230, 8219, 9331, 4321), 
        minor_group = c(221, 332, 241, 241, 754, 821, 242, 431, 133, 
        242, 821, 821, 821, 834, 432, 513, 834, 834, 821, 821, 722, 
        721, 821, 821, 432, 821, 821, 821, 432, 251, 821, 251, 241, 
        513, 821, 821, 821, 721, 234, 721, 821, 821, 343, 522, 332, 
        214, 251, 242, 821, 513, 252, 821, 214, 741, 834, 524, 343, 
        432, 833, 332, 821, 516, 241, 522, 821, 821, 421, 241, 421, 
        251, 742, 251, 251, 251, 252, 242, 251, 252, 214, 243, 234, 
        821, 821, 834, 821, 834, 821, 834, 821, 821, 933, 512, 962, 
        214, 962, 821, 523, 821, 933, 432), submajor_group = c(22, 
        33, 24, 24, 75, 82, 24, 43, 13, 24, 82, 82, 82, 83, 43, 51, 
        83, 83, 82, 82, 72, 72, 82, 82, 43, 82, 82, 82, 43, 25, 82, 
        25, 24, 51, 82, 82, 82, 72, 23, 72, 82, 82, 34, 52, 33, 21, 
        25, 24, 82, 51, 25, 82, 21, 74, 83, 52, 34, 43, 83, 33, 82, 
        51, 24, 52, 82, 82, 42, 24, 42, 25, 74, 25, 25, 25, 25, 24, 
        25, 25, 21, 24, 23, 82, 82, 83, 82, 83, 82, 83, 82, 82, 93, 
        51, 96, 21, 96, 82, 52, 82, 93, 43), major_group = c(2, 3, 
        2, 2, 7, 8, 2, 4, 1, 2, 8, 8, 8, 8, 4, 5, 8, 8, 8, 8, 7, 
        7, 8, 8, 4, 8, 8, 8, 4, 2, 8, 2, 2, 5, 8, 8, 8, 7, 2, 7, 
        8, 8, 3, 5, 3, 2, 2, 2, 8, 5, 2, 8, 2, 7, 8, 5, 3, 4, 8, 
        3, 8, 5, 2, 5, 8, 8, 4, 2, 4, 2, 7, 2, 2, 2, 2, 2, 2, 2, 
        2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 5, 9, 2, 9, 8, 5, 
        8, 9, 4), lnviews = c(5.30330490805908, 5.99645208861902, 
        5.28320372873799, 5.3890717298165, 5.34710753071747, 6.06842558824411, 
        6.0137151560428, 5.64544689764324, 6.08677472691231, 5.97635090929793, 
        3.8286413964891, 5.52942908751142, 4.88280192258637, 4.04305126783455, 
        4.90527477843843, 5.68017260901707, 4.7361984483945, 4.88280192258637, 
        4.53259949315326, 6.57368016696065, 5.43372200355424, 6.02344759296103, 
        4.93447393313069, 3.55534806148941, 7.63964228785801, 4.23410650459726, 
        4.33073334028633, 5.53338948872752, 4.49980967033027, 6.66568371778241, 
        5.48063892334199, 6.49677499018586, 6.25190388316589, 6.85118492749374, 
        3.91202300542815, 5.40717177146012, 4.12713438504509, 3.87120101090789, 
        7.0343879299155, 4.26267987704132, 5.14749447681345, 6.71780469502369, 
        4.93447393313069, 5.29330482472449, 7.07580886397839, 6.16120732169508, 
        5.45958551414416, 6.83087423464618, 4.95582705760126, 6.06145691892802, 
        5.08759633523238, 5.39816270151775, 6.42324696353352, 5.63121178182137, 
        4.82028156560504, 6.24416690066374, 6.27664348934164, 5.93753620508243, 
        7.55903825544338, 6.64639051484773, 6.84481547920826, 6.13556489108174, 
        6.75227037614174, 6.87729607149743, 5.78689738136671, 5.26785815906333, 
        4.85981240436167, 5.13579843705026, 4.94875989037817, 4.89783979995091, 
        5.57215403217776, 4.55387689160054, 4.99043258677874, 4.54329478227, 
        4.40671924726425, 4.44265125649032, 4.95582705760126, 4.24849524204936, 
        5.78382518232974, 6.53813982376767, 5.58724865840025, 5.10594547390058, 
        4.85981240436167, 3.95124371858143, 5.52545293913178, 4.07753744390572, 
        4.69134788222914, 4.49980967033027, 4.29045944114839, 4.14313472639153, 
        7.75790620835175, 5.99396142730657, 6.97447891102505, 6.64378973314767, 
        4.64439089914137, 4.85203026391962, 4.15888308335967, 4.45434729625351, 
        7.80628928926703, 4.45434729625351), count_per_minor_group = c(2089L, 
        10427L, 11348L, 11348L, 3285L, 23165L, 9495L, 772L, 226L, 
        9495L, 23165L, 23165L, 23165L, 3757L, 8504L, 4794L, 3757L, 
        3757L, 23165L, 23165L, 5018L, 2449L, 23165L, 23165L, 8504L, 
        23165L, 23165L, 23165L, 8504L, 25786L, 23165L, 25786L, 11348L, 
        4794L, 23165L, 23165L, 23165L, 2449L, 6651L, 2449L, 23165L, 
        23165L, 878L, 24615L, 10427L, 7534L, 25786L, 9495L, 23165L, 
        4794L, 5026L, 23165L, 7534L, 4778L, 3757L, 2675L, 878L, 8504L, 
        1652L, 10427L, 23165L, 394L, 11348L, 24615L, 23165L, 23165L, 
        1899L, 11348L, 1899L, 25786L, 504L, 25786L, 25786L, 25786L, 
        5026L, 9495L, 25786L, 5026L, 7534L, 5715L, 6651L, 23165L, 
        23165L, 3757L, 23165L, 3757L, 23165L, 3757L, 23165L, 23165L, 
        3580L, 3333L, 5640L, 7534L, 5640L, 23165L, 2229L, 23165L, 
        3580L, 8504L)), row.names = c(NA, -100L), class = c("tbl_df", 
    "tbl", "data.frame"))
    

    我的目标是基于minor_group实现最大的组内相关系数,但minor_ggroup应该至少有1000个观测值,最多有5000个观测值。如果不满足这些条件,那么我想将较小的次要组与最近的次要组聚合(意味着它们共享相同的子主要组),如果次要组仍有5000多个观测值,则根据单位组或代码将最大的次要组划分为较小的组。
    它应该是一种逐步的过程(聚合和分解小群体),以实现基于“更新”的小群体的最高ICC。 如果不能再聚合或拆分组,那么代码应该会中断。

    我设法想出了一些手动的、并不聪明的东西。 有什么建议吗?非常感谢。

    # filter to see minor groups with less than 1000 positions.
    grouped_minor <- df %>% mutate(submajor_or_minor = ifelse(count_per_minor_group < 1000, submajor_group , minor_group ))
    
    # create a new column to count positions in new minor groups (if submajor_or_minor is 2-digit then a new minor group is created) and create a new column that shows the grouped minor_group values as a comma-separated string
    grouped_minor <- grouped_minor %>%
      group_by(submajor_or_minor) %>%
      mutate(count_new_minor = n()) %>%
      mutate(grouped_minors = paste(unique(minor_group), collapse = ", ")) %>%
      ungroup() %>% as.data.frame()
    
    # first ICC
    ICC_minor1 <- lmer(formula = lnviews ~ 1 + (1|grouped_minors),data = grouped_minor) 
    performance::icc(ICC_minor1)
    
    # filter to see unit_group with less than 1000 positions, those will need to be aggregated in a bigger group
    grouped_minor <- grouped_minor %>% mutate(minor_or_unit = ifelse(count_per_unit_group  < 1000, minor_group  , unit_group))
    
    # create a new column to count positions in new unit groups (if minor_or_unit is 3-digit then a new unit group is created) and create a new column that shows the grouped minor_group values as a comma-separated string
    grouped_minor <- grouped_minor %>% 
      group_by(minor_or_unit) %>%
      mutate(count_new_unit = n()) %>% 
      mutate(grouped_unit = paste(unique(unit_group), collapse = ", ")) %>%
      ungroup() %>% as.data.frame()
    
    # second ICC
    ICC_minor2 <- lmer(formula = lnviews ~ 1 + (1|grouped_unit),data = grouped_minor) 
    performance::icc(ICC_minor2)
    
    # filter to see unit groups with more than 5000 positions
    grouped_minor <- grouped_minor %>% mutate(unit_or_code = ifelse(count_per_unit_group  > 5000,   code, grouped_unit  ))
    
    # if unit_or_code is 7-digit then a new code group is created
    grouped_minor <- grouped_minor %>% 
      group_by(unit_or_code) %>%
      mutate(count_new_code= n()) %>% 
      mutate(grouped_code= paste(unique(code), collapse = ", ")) %>%
      ungroup() %>% as.data.frame()
    
    ICC_minor3 <- lmer(formula = lnviews ~ 1 + (1|as.factor(grouped_code)),data = grouped_minor) 
    performance::icc(ICC_minor3)
    
    0 回复  |  直到 2 年前