我有一个数据帧,它具有5个类别的分层分类——主要组(1位数)、次要组(2位数)、次主要组(3位数)、单位组(4位数)和代码(7位数)。每一行都是唯一的,都有自己的视图。
df = structure(list(order_id = c("5015900", "5015903", "5015904",
"5015905", "5015906", "5015908", "5015909", "5015910", "5015911",
"5015912", "5015913", "5015916", "5015917", "5015918", "5015920",
"5015921", "5015922", "5015923", "5015924", "5015925", "5015926",
"5015927", "5015928", "5015929", "5015930", "5015931", "5015932",
"5015933", "5015935", "5015936", "5015937", "5015938", "5015939",
"5015940", "5015941", "5015942", "5015943", "5015950", "5015951",
"5015957", "5015958", "5015959", "5015960", "5015961", "5015963",
"5015964", "5015965", "5015966", "5015967", "5015968", "5015969",
"5015970", "5015971", "5015972", "5015973", "5015979", "5015980",
"5015985", "5015991", "5015992", "5015993", "5015994", "5015995",
"5016001", "5016008", "5016017", "5016021", "5016022", "5016023",
"5016025", "5016026", "5016027", "5016028", "5016029", "5016030",
"5016031", "5016032", "5016033", "5016034", "5016036", "5016037",
"5016039", "5016041", "5016043", "5016044", "5016046", "5016047",
"5016048", "5016049", "5016050", "5016051", "5016052", "5016053",
"5016054", "5016055", "5016056", "5016057", "5016058", "5016059",
"5016060"), code = c(2212999, 3322000, 2412001, 2412001, 7543999,
8219999, 2421003, 4311001, 1330999, 2423003, 8219999, 8219999,
8219999, 8344000, 4321001, 5132002, 8344000, 8344000, 8219999,
8219999, 7223003, 7212999, 8219999, 8219999, 4321001, 8219999,
8219999, 8219999, 4321001, 2512001, 8219999, 2514000, 2412001,
5131001, 8219999, 8219999, 8219999, 7212999, 2342001, 7212999,
8219999, 8219999, 3433999, 5223999, 3322000, 2144003, 2512001,
2423999, 8219999, 5131001, 2521003, 8219999, 2142019, 7411001,
8342001, 5244002, 3434000, 4321001, 8332999, 3322000, 8219999,
5169999, 2411011, 5223999, 8219999, 8219999, 4211003, 2413999,
4211003, 2512001, 7422001, 2511001, 2511001, 2512001, 2522001,
2421003, 2512001, 2529001, 2144003, 2431005, 2341002, 8219999,
8219999, 8344000, 8219999, 8344000, 8219999, 8344000, 8219999,
8219999, 9331000, 5120000, 9621001, 2149020, 9629999, 8219999,
5230003, 8219999, 9331000, 4321001), unit_group = c(2212, 3322,
2412, 2412, 7543, 8219, 2421, 4311, 1330, 2423, 8219, 8219, 8219,
8344, 4321, 5132, 8344, 8344, 8219, 8219, 7223, 7212, 8219, 8219,
4321, 8219, 8219, 8219, 4321, 2512, 8219, 2514, 2412, 5131, 8219,
8219, 8219, 7212, 2342, 7212, 8219, 8219, 3433, 5223, 3322, 2144,
2512, 2423, 8219, 5131, 2521, 8219, 2142, 7411, 8342, 5244, 3434,
4321, 8332, 3322, 8219, 5169, 2411, 5223, 8219, 8219, 4211, 2413,
4211, 2512, 7422, 2511, 2511, 2512, 2522, 2421, 2512, 2529, 2144,
2431, 2341, 8219, 8219, 8344, 8219, 8344, 8219, 8344, 8219, 8219,
9331, 5120, 9621, 2149, 9629, 8219, 5230, 8219, 9331, 4321),
minor_group = c(221, 332, 241, 241, 754, 821, 242, 431, 133,
242, 821, 821, 821, 834, 432, 513, 834, 834, 821, 821, 722,
721, 821, 821, 432, 821, 821, 821, 432, 251, 821, 251, 241,
513, 821, 821, 821, 721, 234, 721, 821, 821, 343, 522, 332,
214, 251, 242, 821, 513, 252, 821, 214, 741, 834, 524, 343,
432, 833, 332, 821, 516, 241, 522, 821, 821, 421, 241, 421,
251, 742, 251, 251, 251, 252, 242, 251, 252, 214, 243, 234,
821, 821, 834, 821, 834, 821, 834, 821, 821, 933, 512, 962,
214, 962, 821, 523, 821, 933, 432), submajor_group = c(22,
33, 24, 24, 75, 82, 24, 43, 13, 24, 82, 82, 82, 83, 43, 51,
83, 83, 82, 82, 72, 72, 82, 82, 43, 82, 82, 82, 43, 25, 82,
25, 24, 51, 82, 82, 82, 72, 23, 72, 82, 82, 34, 52, 33, 21,
25, 24, 82, 51, 25, 82, 21, 74, 83, 52, 34, 43, 83, 33, 82,
51, 24, 52, 82, 82, 42, 24, 42, 25, 74, 25, 25, 25, 25, 24,
25, 25, 21, 24, 23, 82, 82, 83, 82, 83, 82, 83, 82, 82, 93,
51, 96, 21, 96, 82, 52, 82, 93, 43), major_group = c(2, 3,
2, 2, 7, 8, 2, 4, 1, 2, 8, 8, 8, 8, 4, 5, 8, 8, 8, 8, 7,
7, 8, 8, 4, 8, 8, 8, 4, 2, 8, 2, 2, 5, 8, 8, 8, 7, 2, 7,
8, 8, 3, 5, 3, 2, 2, 2, 8, 5, 2, 8, 2, 7, 8, 5, 3, 4, 8,
3, 8, 5, 2, 5, 8, 8, 4, 2, 4, 2, 7, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 5, 9, 2, 9, 8, 5,
8, 9, 4), lnviews = c(5.30330490805908, 5.99645208861902,
5.28320372873799, 5.3890717298165, 5.34710753071747, 6.06842558824411,
6.0137151560428, 5.64544689764324, 6.08677472691231, 5.97635090929793,
3.8286413964891, 5.52942908751142, 4.88280192258637, 4.04305126783455,
4.90527477843843, 5.68017260901707, 4.7361984483945, 4.88280192258637,
4.53259949315326, 6.57368016696065, 5.43372200355424, 6.02344759296103,
4.93447393313069, 3.55534806148941, 7.63964228785801, 4.23410650459726,
4.33073334028633, 5.53338948872752, 4.49980967033027, 6.66568371778241,
5.48063892334199, 6.49677499018586, 6.25190388316589, 6.85118492749374,
3.91202300542815, 5.40717177146012, 4.12713438504509, 3.87120101090789,
7.0343879299155, 4.26267987704132, 5.14749447681345, 6.71780469502369,
4.93447393313069, 5.29330482472449, 7.07580886397839, 6.16120732169508,
5.45958551414416, 6.83087423464618, 4.95582705760126, 6.06145691892802,
5.08759633523238, 5.39816270151775, 6.42324696353352, 5.63121178182137,
4.82028156560504, 6.24416690066374, 6.27664348934164, 5.93753620508243,
7.55903825544338, 6.64639051484773, 6.84481547920826, 6.13556489108174,
6.75227037614174, 6.87729607149743, 5.78689738136671, 5.26785815906333,
4.85981240436167, 5.13579843705026, 4.94875989037817, 4.89783979995091,
5.57215403217776, 4.55387689160054, 4.99043258677874, 4.54329478227,
4.40671924726425, 4.44265125649032, 4.95582705760126, 4.24849524204936,
5.78382518232974, 6.53813982376767, 5.58724865840025, 5.10594547390058,
4.85981240436167, 3.95124371858143, 5.52545293913178, 4.07753744390572,
4.69134788222914, 4.49980967033027, 4.29045944114839, 4.14313472639153,
7.75790620835175, 5.99396142730657, 6.97447891102505, 6.64378973314767,
4.64439089914137, 4.85203026391962, 4.15888308335967, 4.45434729625351,
7.80628928926703, 4.45434729625351), count_per_minor_group = c(2089L,
10427L, 11348L, 11348L, 3285L, 23165L, 9495L, 772L, 226L,
9495L, 23165L, 23165L, 23165L, 3757L, 8504L, 4794L, 3757L,
3757L, 23165L, 23165L, 5018L, 2449L, 23165L, 23165L, 8504L,
23165L, 23165L, 23165L, 8504L, 25786L, 23165L, 25786L, 11348L,
4794L, 23165L, 23165L, 23165L, 2449L, 6651L, 2449L, 23165L,
23165L, 878L, 24615L, 10427L, 7534L, 25786L, 9495L, 23165L,
4794L, 5026L, 23165L, 7534L, 4778L, 3757L, 2675L, 878L, 8504L,
1652L, 10427L, 23165L, 394L, 11348L, 24615L, 23165L, 23165L,
1899L, 11348L, 1899L, 25786L, 504L, 25786L, 25786L, 25786L,
5026L, 9495L, 25786L, 5026L, 7534L, 5715L, 6651L, 23165L,
23165L, 3757L, 23165L, 3757L, 23165L, 3757L, 23165L, 23165L,
3580L, 3333L, 5640L, 7534L, 5640L, 23165L, 2229L, 23165L,
3580L, 8504L)), row.names = c(NA, -100L), class = c("tbl_df",
"tbl", "data.frame"))
我的目标是基于minor_group实现最大的组内相关系数,但minor_ggroup应该至少有1000个观测值,最多有5000个观测值。如果不满足这些条件,那么我想将较小的次要组与最近的次要组聚合(意味着它们共享相同的子主要组),如果次要组仍有5000多个观测值,则根据单位组或代码将最大的次要组划分为较小的组。
它应该是一种逐步的过程(聚合和分解小群体),以实现基于“更新”的小群体的最高ICC。
如果不能再聚合或拆分组,那么代码应该会中断。
我设法想出了一些手动的、并不聪明的东西。
有什么建议吗?非常感谢。
# filter to see minor groups with less than 1000 positions.
grouped_minor <- df %>% mutate(submajor_or_minor = ifelse(count_per_minor_group < 1000, submajor_group , minor_group ))
# create a new column to count positions in new minor groups (if submajor_or_minor is 2-digit then a new minor group is created) and create a new column that shows the grouped minor_group values as a comma-separated string
grouped_minor <- grouped_minor %>%
group_by(submajor_or_minor) %>%
mutate(count_new_minor = n()) %>%
mutate(grouped_minors = paste(unique(minor_group), collapse = ", ")) %>%
ungroup() %>% as.data.frame()
# first ICC
ICC_minor1 <- lmer(formula = lnviews ~ 1 + (1|grouped_minors),data = grouped_minor)
performance::icc(ICC_minor1)
# filter to see unit_group with less than 1000 positions, those will need to be aggregated in a bigger group
grouped_minor <- grouped_minor %>% mutate(minor_or_unit = ifelse(count_per_unit_group < 1000, minor_group , unit_group))
# create a new column to count positions in new unit groups (if minor_or_unit is 3-digit then a new unit group is created) and create a new column that shows the grouped minor_group values as a comma-separated string
grouped_minor <- grouped_minor %>%
group_by(minor_or_unit) %>%
mutate(count_new_unit = n()) %>%
mutate(grouped_unit = paste(unique(unit_group), collapse = ", ")) %>%
ungroup() %>% as.data.frame()
# second ICC
ICC_minor2 <- lmer(formula = lnviews ~ 1 + (1|grouped_unit),data = grouped_minor)
performance::icc(ICC_minor2)
# filter to see unit groups with more than 5000 positions
grouped_minor <- grouped_minor %>% mutate(unit_or_code = ifelse(count_per_unit_group > 5000, code, grouped_unit ))
# if unit_or_code is 7-digit then a new code group is created
grouped_minor <- grouped_minor %>%
group_by(unit_or_code) %>%
mutate(count_new_code= n()) %>%
mutate(grouped_code= paste(unique(code), collapse = ", ")) %>%
ungroup() %>% as.data.frame()
ICC_minor3 <- lmer(formula = lnviews ~ 1 + (1|as.factor(grouped_code)),data = grouped_minor)
performance::icc(ICC_minor3)