#### BHPS and UKHLS data extraction and preparation script #### # Author: Rory Coulter, University College London # Data citations for the UK Data Service collections used in this script: # University of Essex, Institute for Social and Economic Research. # (2021). Understanding Society: Waves 1-10, 2009-2019 and Harmonised BHPS: # Waves 1-18, 1991-2009. [data collection]. 13th Edition. UK Data Service. # SN: 6614, http://doi.org/10.5255/UKDA-SN-6614-14 # Script produced using RStudio 2022 and R 4.2.1, last updated 02/05/23 # The purpose of this script is to create a long format person-year file # containing records from all individuals ever observed in either the BHPS # or UKHLS. Some key variables many users require are also extracted and # cleaned from the various study files. Users can customise the base code # contained here as needed for their particular application. # Please note that although the script has been checked, users are responsible # for further checking all extraction, harmonisation and cleaning procedures. # Although every effort has been made to check the derived files # there may be errors or suboptimal decisions. Please therefore treat this # file as a base resource which should be customised and checked to meet the # demands of your application. # To run the script requires Stata versions of SN6614 downloaded and saved into a # folder called UKDA-6614-stata. You will then need to amend the portions of file # paths marked **** in the code below to reflect your folder structure. #### Load R packages from library (install from CRAN first if needed) #### # install.packages("tidyverse") # install.packages("haven") # install.packages("lubridate") # Load packages packages <- c("tidyverse", "haven", "lubridate") lapply(packages, library, character.only = T) rm(packages) #### 1. Load cross-wave id files and convert each one to long format #### BHPS_xwaveid <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/bhps_wx/xwaveid_bh.dta") %>% mutate_if(haven::is.labelled, haven:: as_factor) # Remember to replace the **** with your file path UKHLS_xwaveid <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/ukhls_wx/xwaveid.dta") %>% mutate_if(haven::is.labelled, haven::as_factor) # Create the BHPS loop elements we need - these identify each wave wavelist <- c("ba_", "bb_", "bc_", "bd_", "be_", "bf_", "bg_", "bh_", "bi_", "bj_", "bk_", "bl_", "bm_", "bn_", "bo_", "bp_", "bq_", "br_") # Vector of waves counter <- 0 # Set wave counter indicator BHPS_long <- data.frame() # Empty dataset to fill with indicators # Extract each wave's data and bind together into long format for (wave in wavelist) { counter <- counter + 1 # Count wave number as we loop through data <- select(BHPS_xwaveid, pid, pidp, sex, birthy, sampst, psu, strata, hhorig_bh, starts_with(wave)) # Select vars we need for that wave - adjustable data <- mutate(data, wave = counter) # Generate wave variable data <- data %>% rename_all(funs(str_replace_all(., wave, ""))) # Remove wave prefixes BHPS_long <- bind_rows(BHPS_long, data) rm(data) # Discard junk data } # Note: factors coerced to characters # Tidy up environment and the 'BHPS_long' long format datafile rm(counter, wave, wavelist, BHPS_xwaveid) BHPS_long <- BHPS_long %>% arrange(pidp, wave) %>% select(pidp, pid, wave, hidp, hid, pno, sampst, strata, psu, hhorig_bh, birthy, sex, ivfho, ivfio1_bh, ivfio) # Repeat process for UKHLS records - first create looping elements wavelist <- c("a_", "b_", "c_", "d_", "e_", "f_", "g_", "h_", "i_", "j_") # Vector of wave strings counter <- 18 # Set wave counter indicator to 18 - to pick up from BHPS UKHLS_long <- data.frame() # Empty dataset # Create separate datasets for each wave for (wave in wavelist) { counter <- counter + 1 # Count wave number as we loop through data <- select(UKHLS_xwaveid, pidp, sex, birthy, sampst, psu, strata, hhorig, starts_with(wave)) # Select vars we need for that wave - adjustable data <- mutate(data, wave = counter) # Generate a wave variable data <- data %>% rename_all(funs(str_replace_all(., wave, ""))) # Remove wave prefix from variable names data <- data %>% mutate(pno = as.factor(pno)) UKHLS_long <- bind_rows(UKHLS_long, data) # Bind into master file rm(data) # Discard junk data } # Tidy up environment and the long file rm(counter, wave, wavelist, UKHLS_xwaveid) UKHLS_long <- UKHLS_long %>% arrange(pidp, wave) %>% select(pidp, wave, hidp, pno, sampst, strata, psu, hhorig, birthy, sex, ivfho, ivfio) # At this point you have two long format person-year files: one for people who # ever entered the BHPS (1991-2008) and another for those recorded in UKHLS # (2009+). This can provide the raw base for a longitudinal analysis. #### 2. Clean up key variables from the xwaveid files #### # Prepare variables for file combine UKHLS_long <- UKHLS_long %>% mutate(pno = as.numeric(as.character(pno)), # Convert pno to numeric birthy = as.factor(birthy), # Convert birthy to factor sex = as.factor(sex)) # Convert sex to factor # Build an integrated person-year file Person_file <- bind_rows(BHPS_long, UKHLS_long) %>% arrange(pidp, wave) %>% select(pidp:psu, hhorig_bh, hhorig, ivfho, ivfio1_bh, ivfio, birthy, sex) # Clean up some key identifiers Person_file <- Person_file %>% mutate(strata = as.numeric(strata), psu = as.numeric(psu)) # Missing if ECHP samples # Code up HH origin variables Person_file <- Person_file %>% mutate(hhorigin = case_when(hhorig_bh == "bhps gb (original) 1991" | hhorig == "bhps gb 1991" ~ "BHPS 1991", hhorig_bh == "bhps sco 1999" | hhorig == "bhps sco 1999" ~ "BHPS Scot 1999", hhorig_bh == "bhps wal 1999" | hhorig == "bhps wal 1999" ~ "BHPS Wales 1999", hhorig_bh == "bhps ni 2001" | hhorig == "bhps ni 2001" ~ "BHPS N.I. 2001", hhorig_bh == "ECHP - SCPR" | hhorig_bh == "ECHP - ONS" | hhorig_bh == "ECHP - NI" ~ "ECHP", hhorig == "ukhls gb 2009-10" ~ "UKHLS GPS", hhorig == "ukhls emboost 2009-10" ~ "UKHLS EMB", hhorig == "ukhls ni 2009-10" ~ "UKHLS N.I.", hhorig == "ukhls iemb 2014-15" ~ "UKHLS IEMB 2014-15")) # Code up individual interview outcome Person_file <- Person_file %>% mutate(int_outcome = case_when(ivfio1_bh == "Full interview" | ivfio == "full interview" | ivfio == "Full interview" ~ "Full interview", ivfio1_bh == "Proxy" | ivfio == "proxy interview" | ivfio == "Proxy interview" ~ "Proxy", ivfio == "telephone intvw" | ivfio == "Telephone intvw" ~ "Phone", TRUE ~ "No interview")) # Clean up birth year which has imported as character table(Person_file$birthy, useNA = "ifany") # Note some are 9898 (36 cases) and should be set to missing Person_file <- Person_file %>% mutate(doby = ifelse(birthy == "9898" | birthy == "missing" | birthy == "-1" | birthy == "-2" | birthy == "-9", NA, as.numeric(as.character(birthy)))) # Clean up sex into a female dummy Person_file <- Person_file %>% mutate(sex_cleaned = case_when(sex == "1" | sex == "Male" ~ "Male", sex == "2" | sex == "Female" ~ "Female", sex == "missing" | sex == "-9" | sex == "-2" | sex == "-1" ~"Missing")) # Tidy dataset and move on Person_file <- Person_file %>% select(pidp: hhorig, hhorigin, ivfho:ivfio, int_outcome, birthy, doby, sex, sex_cleaned) rm(BHPS_long, UKHLS_long) #### 3. Extract household interview files and clean up #### # Create a list of all paths to data to be imported: BHPS only path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*hhresp.dta", full.names = T, recursive = T) path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28) print(path_list) # Check BHPS_HH <- data.frame() # Empty datafile # Import datasets and variables from BHPS for (file in path_list) { wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() # Extract wave number (replace * with character position of the wave number in your path) # and convert this to a numeric variable data <- read_dta(paste(file)) # Import wave files data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(starts_with("hid"), # HH id starts_with("intdate"), # interview dates fihhmngrs_dv, tenure_dv, # basic HH variables gor_dv) # region data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable mutate_if(haven::is.labelled, haven:: as_factor) BHPS_HH <- bind_rows(BHPS_HH, data) rm(data) # Drop junk file } # Repeat path list process for UKHLS path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*hhresp.dta", full.names = T, recursive = T) path_list <- path_list[c(19:28)] # Discard BHPS cases from the list (rows 1-18) print(path_list) UKHLS_HH <- data.frame() # Empty datafile # Repeat the looping process across UKHLS waves - but will need to manually bind due to label issues for (file in path_list) { wave_count <- 18+(str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric()) # Extract wave no. data <- read_dta(paste(file)) data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(starts_with("hid"), # HH id starts_with("intdate"), # interview dates fihhmngrs_dv, tenure_dv, # basic HH variables gor_dv) # region data <- data %>% mutate(wave = wave_count, intdated = as.character(intdated), # Convert to character to aid binding intdatem = as.character(intdatem), # as coding frame varies from wave to wave intdatey = as.character(intdatey), fihhmngrs_dv = as.character(fihhmngrs_dv)) %>% mutate_if(haven::is.labelled, haven::as_factor) UKHLS_HH <- bind_rows(UKHLS_HH, data) rm(data) # Drop junk file } # Tidy up rm(file, path_list, wave_count) # Pool BHPS and UKHLS household interview files hhresp_all <- bind_rows(BHPS_HH, UKHLS_HH) %>% # Warnings = coercion to factors select(hid, hidp, wave, intdatem, intdatey, fihhmngrs_dv, tenure_dv, gor_dv) # Prepare basic HH vars # (1) Create interview month and year variables that includes 1991 (year) and assigns missings to NA: Month first hhresp_all <- hhresp_all %>% mutate(intmonth = case_when(intdatem %in% c("-8", "-9", "missing") ~ "Missing", intdatem %in% c("1", "January") ~ "January", intdatem %in% c("2", "February") ~ "February", intdatem %in% c("3", "March") ~ "March", intdatem %in% c("4", "April") ~ "April", intdatem %in% c("5", "May") ~ "May", intdatem %in% c("6", "June") ~ "June", intdatem %in% c("7", "July") ~ "July", intdatem %in% c("8", "August") ~ "August", intdatem %in% c("9", "September") ~ "September", intdatem %in% c("10", "October") ~ "October", intdatem %in% c("11", "November") ~ "November", intdatem %in% c("12", "December") ~ "December")) # Create year indicator hhresp_all <- hhresp_all %>% mutate(intyear = ifelse(intdatey == "-8" | intdatey == "-9" | intdatey == "missing", NA, intdatey), intyear = ifelse(wave == 1 & is.na(intyear), "1991", intyear)) # assign year = 1991 if surveyed in W1 # Combine month and year of interview into a date variable using lubridate hhresp_all <- hhresp_all %>% mutate(intdate = paste(intmonth, intyear, sep = ","), intdate = myd(intdate, truncated = 1)) # Note: if missing month or year intdate is NA # (2) Create cleaned regional variable hhresp_all <- hhresp_all %>% mutate(region = case_when(gor_dv %in% c("east midlands", "East Midlands") ~ "East Midlands", gor_dv %in% c("east of england", "East of England") ~ "East of England", gor_dv %in% c("london", "London") ~ "London", gor_dv %in% c("north east", "North East") ~ "North East", gor_dv %in% c("north west", "North West") ~ "North West", gor_dv %in% c("northern ireland", "Northern Ireland") ~ "Northern Ireland", gor_dv %in% c("scotland", "Scotland") ~ "Scotland", gor_dv %in% c("south east", "South East") ~ "South East", gor_dv %in% c("south west", "South West", "Channel Islands") ~ "South West", gor_dv %in% c("wales", "Wales") ~ "Wales", gor_dv %in% c("west midlands", "West Midlands") ~ "West Midlands", gor_dv %in% c("yorkshire and the humber", "Yorkshire and the Humber") ~ "Yorkshire and the Humber", gor_dv == "missing" ~ "Missing")) # (3) HH income as numeric variable hhresp_all <- hhresp_all %>% mutate(hh_monthly_inc = as.numeric(fihhmngrs_dv)) # Merge household interview data onto master datafile and tidy up Person_file <- merge(Person_file, hhresp_all, by = c("hidp", "wave"), all.x = T) Person_file <- Person_file %>% arrange(pidp, wave) %>% select(pidp, pid, hidp, hid.x, wave, pno:ivfho, intdatem, intdatey, intmonth, intyear, intdate, ivfio1_bh:sex_cleaned, fihhmngrs_dv, hh_monthly_inc, tenure_dv, gor_dv, region) %>% rename("hid" = "hid.x") rm(BHPS_HH, UKHLS_HH, hhresp_all) # The Person_file now comprises a person-year file with merged HH response # information derived from the household questionnaire. #### 4. Extract xwavedat time-constant individual variables and clean up #### UKHLS_xwavedat <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/ukhls_wx/xwavedat.dta") UKHLS_xwavedat <- UKHLS_xwavedat %>% select(pidp, bornuk_dv, racel_dv) %>% # Ethnicity, UK birth mutate_if(haven::is.labelled, haven:: as_factor) %>% mutate(pidp = as.numeric(as.character(pidp))) # Prepare UK born dummy UKHLS_xwavedat <- UKHLS_xwavedat %>% mutate(babroad = case_when(bornuk_dv %in% c("missing", "inapplicable", "refusal", "don't know") ~ "Missing", bornuk_dv == "born in uk" ~ "UK born", bornuk_dv == "not born in uk" ~ "Born overseas")) # Merge cross-wave data onto the master file Person_file <- merge(Person_file, UKHLS_xwavedat, by = "pidp", all.x = T) rm(UKHLS_xwavedat) # Tidy up Person_file <- Person_file %>% select(pidp:sex_cleaned, bornuk_dv, babroad, racel_dv, fihhmngrs_dv:region) #### 5. Import and prepare indall enumeration variables #### # Create a list of all paths to data to be imported: BHPS only path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indall.dta", full.names = T, recursive = T) path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28) print(path_list) BHPS_indall <- data.frame() # Empty input dataframe # Import datasets and variables from BHPS for (file in path_list) { wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() # Extract wave number (input digit position of wave numbers into the file path) and convert to numeric data <- read_dta(paste(file)) # Import wave files data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(pidp, # Core id variables age, ppno, nchild_dv) data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable mutate_if(haven::is.labelled, haven::as_factor) BHPS_indall <- bind_rows(BHPS_indall, data) rm(data) # Drop junk file } # Repeat extraction for UKHLS cases path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indall.dta", full.names = T, recursive = T) path_list <- path_list[c(19:28)] # Discard BHPS print(path_list) UKHLS_indall <- data.frame() # Empty input dataframe for (file in path_list) { wave_count <- 18+(str_sub(paste(file), *, *)) %>% str_remove_all("[w/]") %>% as.numeric() # Replace the * with the character position of the wave numbers in your path data <- read_dta(paste(file)) # Import wave files data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(pidp, # Core id variables age_dv, ppno, nchild_dv) data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable mutate_if(haven::is.labelled, haven::as_factor) UKHLS_indall <- bind_rows(UKHLS_indall, data) rm(data) # Drop junk file } # Pool waves UKHLS_indall <- UKHLS_indall %>% mutate(nchild_dv = as.character(nchild_dv)) indall <- bind_rows(BHPS_indall, UKHLS_indall) # Clean variables up: (1) age indall <- indall %>% mutate(respage = ifelse(wave <= 18, as.numeric(as.character(age)), as.numeric(as.character(age_dv)))) # Note: NA induced by coercion warning can be ignored # (2) Living with kids under 16 dummy variable (as parent figure) indall <- indall %>% mutate(kids = case_when(nchild_dv == "-9" ~ "Missing", nchild_dv == "0" ~ "No kids", nchild_dv %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") ~ "Kids")) # (3) Living with a partner/spouse and their pno indall <- indall %>% mutate(partnered = case_when(ppno %in% c("don't know", "missing", "refusal") ~ "Missing", ppno %in% c("No Spouse in HH", "partner not in hh") ~ "No partner", ppno %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16") ~ "Partner"), partpno = ifelse(partnered == "Partner", as.numeric(as.character(ppno)), NA)) # Merge onto master and tidy up Person_file <- merge(Person_file, indall, by = c("pidp", "wave"), all.x = T) Person_file <- Person_file %>% select(pidp, pid, wave, hidp:doby, age, age_dv, respage, sex:racel_dv, ppno, partnered, partpno, nchild_dv, kids, fihhmngrs_dv:region) %>% arrange(pidp, wave) rm(BHPS_indall, UKHLS_indall, indall, file, path_list, wave_count) #### 6. Import and prepare key indresp individual interview variables #### # BHPS importation path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indresp.dta", full.names = T, recursive = T) path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28) print(path_list) BHPS_indresp <- data.frame() # Import datasets and individual interview variables from BHPS for (file in path_list) { wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() data <- read_dta(paste(file)) # Import wave files data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(pidp, # Core id variables hiqual_dv, jbstat, fimngrs_dv, jbhas, jboff, jbnssec8_dv, starts_with("lrwght"), starts_with("lrwt")) # Keep longitudinal weights data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable mutate_if(haven::is.labelled, haven::as_factor) BHPS_indresp <- bind_rows(BHPS_indresp, data) rm(data) # Drop junk file } # Repeat listing path process for UKHLS path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indresp.dta", full.names = T, recursive = T) path_list <- path_list[c(19:28)] # Discard BHPS cases print(path_list) UKHLS_indresp <- data.frame() # Extract UKHLS datasets for (file in path_list) { wave_count <- 18 + str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() data <- read_dta(paste(file)) # Import wave files data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars data <- data %>% select(pidp, # Core id variables hiqual_dv, jbstat, fimngrs_dv, jbhas, jboff, jbnssec8_dv, starts_with("indinus"), starts_with("indinub")) # Keep longitudinal weights data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable mutate_if(haven::is.labelled, haven::as_factor) UKHLS_indresp <- bind_rows(UKHLS_indresp, data) rm(data) # Drop junk file } # Bind the BHPS and UKHLS indresp files together indresp <- bind_rows(BHPS_indresp, UKHLS_indresp) # Start cleaning some key variables we need: (1) degree qualifications indresp <- indresp %>% mutate(degree = case_when(hiqual_dv %in% c("missing", "Missing", "inapplicable") ~ "Missing", hiqual_dv %in% c("Degree", "Other higher", "Other higher degree") ~ "Degree", TRUE ~ "No degree")) # (2) Generate ILO employment status variable (employed) using jbhas and jboff indresp <- indresp %>% mutate(jbhas_cleaned = case_when(jbhas %in% c("don't know", "inapplicable", "missing", "refusal") ~ "Missing", jbhas %in% c("no", "No") ~ "No", jbhas %in% c("yes", "Yes") ~ "Yes"), jboff_cleaned = case_when(jboff %in% c("don't know", "inapplicable", "missing", "refusal") ~ "Missing", jboff %in% c("no", "No", "Waiting for job", "waiting to take up job", "Waiting to take up job") ~ "No", jboff %in% c("yes", "Yes") ~ "Yes")) indresp <- indresp %>% mutate(employed = case_when(jbhas_cleaned == "Missing" | (jbhas_cleaned == "No" & jboff_cleaned == "Missing") ~ "Missing", jbhas_cleaned == "Yes" | (jbhas_cleaned != "Missing" & jboff_cleaned == "Yes") ~ "Employed", jbhas_cleaned == "No" & jboff_cleaned == "No" ~ "Not employed")) # (3) NSSEC of occupation for those currently working indresp <- indresp %>% mutate(emp_type = case_when(employed == "Missing" ~ "Missing", employed == "Not employed" ~ "Not employed", employed == "Employed" & jbnssec8_dv %in% c("Higher professional", "Lower management & professional", "Large employers & higher management") ~ "Employed, manag/prof", employed == "Employed" & jbnssec8_dv %in% c("Intermediate", "Lower supervisory & technical", "Routine", "Semi-routine", "Small employers & own account") ~ "Employed, other", employed == "Employed" & jbnssec8_dv %in% c("don't know", "inapplicable", "missing") ~ "Employed, no NSSEC")) # (4) Individual income from all sources indresp<- indresp%>% mutate(fimngrs_dv = as.numeric(as.character(fimngrs_dv))) indresp <- indresp %>% mutate(indinc = ifelse(fimngrs_dv<0, 0, fimngrs_dv)) # Code negatives to be 0 summary(indresp$fimngrs_dv) summary(indresp$indinc) # Merge onto master person year file and tidy up Person_file <-merge(Person_file, indresp, by = c("pidp", "wave"), all.x = T) Person_file <- Person_file %>% select(pidp:kids, hiqual_dv, degree, jbstat, jbhas, jbhas_cleaned, jboff, jboff_cleaned, employed, jbnssec8_dv, emp_type, fimngrs_dv, indinc, fihhmngrs_dv:region, lrwght:indinub_lw) rm(indresp, BHPS_indresp, UKHLS_indresp, file, path_list, wave_count) # The Person_file dataset contains 2,038,636 person-year observations for # everyone observed in either BHPS or UKHLS. Many of the rows are empty as the # person was not actually interviewed in that sweep. # 61 variables are included on the base person-year file. Users can adapt the # above code to add, discard and derive additional variables as required. # If you use this file in your work please include a citation to the data # collection. Do get in touch if you find any problems or errors in the code.