编辑:根据评论更新。
您可以添加一列,用来标记每一行是否属于样本。例如,可以尝试以下方法:
library(dplyr)

# Toy population: two years, twelve ids, two age bands (7 and 12).
df <- data.frame(
  year = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  age = c(7, 7, 7, 12, 12, 12, 7, 7, 7, 12, 12, 12)
)

# Number of ids to draw per year from each age band.
n_per_year_low_age <- 2
n_per_year_high_age <- 1

# Draw up to `n` elements of `ids` without replacement.
#
# Sampling is done on positions via sample.int() rather than sample(ids, n):
# when `ids` is a single number k >= 1, sample(ids, n) draws from 1:k instead
# of from the one-element set, which would flag the wrong ids (or error).
# The draw is capped at length(ids), so a stratum with fewer than `n`
# candidates contributes all of its ids instead of raising an error.
pick_ids <- function(ids, n) {
  ids[sample.int(length(ids), min(n, length(ids)))]
}

# Flag the sampled rows: in_sample is 1 for drawn ids and 0 otherwise.
# The two age bands (age < 8, age > 8) are disjoint, so an id is drawn at
# most once. The result stays grouped by year; call ungroup() if later steps
# should not operate per year.
df <- df %>%
  group_by(year) %>%
  mutate(
    in_sample = as.numeric(
      id %in% c(
        pick_ids(id[age < 8], n_per_year_low_age),
        pick_ids(id[age > 8], n_per_year_high_age)
      )
    )
  )
输出(由于是随机抽样,每次运行的结果可能不同):
# A tibble: 12 x 4
# Groups: year [2]
year id age in_sample
<dbl> <dbl> <dbl> <dbl>
1 1.00 1.00 7.00 1.00
2 1.00 2.00 7.00 1.00
3 1.00 3.00 7.00 0
4 1.00 4.00 12.0 1.00
5 1.00 5.00 12.0 0
6 1.00 6.00 12.0 0
7 2.00 7.00 7.00 1.00
8 2.00 8.00 7.00 0
9 2.00 9.00 7.00 1.00
10 2.00 10.0 12.0 0
11 2.00 11.0 12.0 0
12 2.00 12.0 12.0 1.00
接下来的操作就很简单了:
# Keep only the rows that were flagged as part of the sample.
df %>%
  filter(in_sample == 1)

# Mean id per year, split by sampled (1) versus not sampled (0), to compare
# the sample against the rest of the population.
df %>%
  group_by(year, in_sample) %>%
  summarise(mean(id))