file <- "/Users/pc/Downloads/hate-crime-2021/2021_HC_NATIONAL_MASTER_FILE_ENC.txt"
library(tidyverse)
# read file in line by line, and make each line it's own string
df <- readLines(file) |>
as_tibble() |>
mutate(BH_group = cumsum(grepl("^BH", value))) # each time the line starts with BH, we know we are starting a new batch header row, so we increment our group number by 1
# since the data is fixed width, we use the separate_wider_position function to split the data into columns
# this is a huge pain to do, so I will leave doing the ir ones to you. You can find the details on how wide each column is meant to be in the help file you find when downloading the dataset
bh_df <- df[grepl("^BH", df$value),] |>
separate_wider_position(value, widths = c(type = 2,
state = 2,
ori = 9,
incident_number = 12,
date_added = 8,
date_ori_went_nibrs = 8,
city = 30,
state_abb = 2,
pop_group = 2,
country_div = 1,
country_region = 1,
agency_ind = 1,
core_city = 1,
covered_by_ori = 9,
fbi_field_office = 4,
judicial_district = 4,
agency_nibrs_flag = 1,
date_agency_inactive = 8,
current_pop1 = 9,
ucr_county_code1 = 3,
msa_code1 = 3,
last_pop1 = 9,
current_pop2 = 9,
ucr_county_code2 = 3,
msa_code2 = 3,
last_pop2 = 9,
current_pop3 = 9,
ucr_county_code3 = 3,
msa_code3 = 3,
last_pop3 = 9,
current_pop4 = 9,
ucr_county_code4 = 3,
msa_code4 = 3,
last_pop4 = 9,
current_pop5 = 9,
ucr_county_code5 = 3,
msa_code5 = 3,
last_pop5 = 9,
master_file_year = 4,
state_first_quarter_activy = 1,
state_second_quarter_activity = 1,
state_third_quarter_activity = 1,
state_fourth_quarter_activity = 1,
fed_first_quarter_activy = 1,
fed_second_quarter_activity = 1,
fed_third_quarter_activity = 1,
fed_fourth_quarter_activity = 1,
agency_name = 30,
fips_county1 = 3,
fips_county2 = 3,
fips_county3 = 3,
fips_county4 = 3,
fips_county5 = 3
))
ir_df <- df[grepl("^IR", df$X1),]
# now do something similar, but with the widths for the ir data! :-)
输出
# A tibble: 25,298 Ã 53
type state ori incident_number date_added date_ori_went_nibrs city
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 BH 50 AK0010100 000000000000 " " " " "ANCHORâ¦
2 BH 50 AK0010200 000000000000 " " "20210101" "FAIRBAâ¦
3 BH 50 AK0010300 000000000000 " " " " "JUNEAUâ¦
4 BH 50 AK0010400 000000000000 " " "20210101" "KETCHIâ¦
5 BH 50 AK0010500 000000000000 " " "20210101" "KODIAKâ¦
6 BH 50 AK0010600 000000000000 " " "20210101" "NOME â¦
7 BH 50 AK0010700 000000000000 " " "20210101" "PETERSâ¦
8 BH 50 AK0010800 000000000000 " " "20210101" "SEWARDâ¦
9 BH 50 AK0010900 000000000000 " " "20210101" "SITKA â¦
10 BH 50 AK0011000 000000000000 " " "20210101" "SKAGWAâ¦
# â¹ 25,288 more rows
# â¹ 46 more variables: state_abb <chr>, pop_group <chr>, country_div <chr>,
# country_region <chr>, agency_ind <chr>, core_city <chr>,
# covered_by_ori <chr>, fbi_field_office <chr>, judicial_district <chr>,
# agency_nibrs_flag <chr>, date_agency_inactive <chr>, current_pop1 <chr>,
# ucr_county_code1 <chr>, msa_code1 <chr>, last_pop1 <chr>,
# current_pop2 <chr>, ucr_county_code2 <chr>, msa_code2 <chr>, â¦
# â¹ Use `print(n = ...)` to see more rows
处理完IR数据帧后,可以执行以下操作:
left_join(ir_df,
bh_df |> select(ori, BH_group), by = "BH_group")
左联接保留第一个数据帧的所有行,并添加第二个数据帧中关键字匹配的列(因此在这种情况下,它(例如)将第一个BH_group的ori添加到ir_df中BH_group==1的所有行。