#### BHPS and UKHLS data extraction and preparation script ####



# Author: Rory Coulter, University College London


# Data citations for the UK Data Service collections used in this script:

# University of Essex, Institute for Social and Economic Research. 
# (2021). Understanding Society: Waves 1-10, 2009-2019 and Harmonised BHPS: 
# Waves 1-18, 1991-2009. [data collection]. 13th Edition. UK Data Service. 
# SN: 6614, http://doi.org/10.5255/UKDA-SN-6614-14



# Script produced using RStudio 2022 and R 4.2.1, last updated 02/05/23



# The purpose of this script is to create a long format person-year file 
# containing records from all individuals ever observed in either the BHPS
# or UKHLS. Some key variables many users require are also extracted and 
# cleaned from the various study files. Users can customise the base code
# contained here as needed for their particular application.


# Please note that although the script has been checked, users are responsible 
# for further checking all extraction, harmonisation and cleaning procedures. 
# Although every effort has been made to check the derived files 
# there may be errors or suboptimal decisions. Please therefore treat this 
# file as a base resource which should be customised and checked to meet the 
# demands of your application. 


# To run the script requires Stata versions of SN6614 downloaded and saved into a
# folder called UKDA-6614-stata. You will then need to amend the portions of file 
# paths marked **** in the code below to reflect your folder structure.




#### Load R packages from library (install from CRAN first if needed) ####



# install.packages("tidyverse") 

# install.packages("haven")

# install.packages("lubridate")


# Load packages
packages <- c("tidyverse", "haven", "lubridate")

lapply(packages, library, character.only = T)

rm(packages)




#### 1. Load cross-wave id files and convert each one to long format ####




BHPS_xwaveid <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/bhps_wx/xwaveid_bh.dta") %>%
                mutate_if(haven::is.labelled, haven:: as_factor)
                # Remember to replace the **** with your file path


UKHLS_xwaveid <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/ukhls_wx/xwaveid.dta") %>%
                 mutate_if(haven::is.labelled, haven::as_factor)




# Create the BHPS loop elements we need - these identify each wave
wavelist <- c("ba_", "bb_", "bc_", "bd_", "be_", "bf_", "bg_", "bh_", "bi_",
              "bj_", "bk_", "bl_", "bm_", "bn_", "bo_", "bp_", "bq_", "br_") 
            # Vector of waves


counter <- 0 # Set wave counter indicator


BHPS_long <- data.frame() # Empty dataset to fill with indicators



# Extract each wave's data and bind together into long format
for (wave in wavelist) {
  
  counter <- counter + 1 # Count wave number as we loop through
  
  data <- select(BHPS_xwaveid, pid, pidp, sex, birthy, sampst, psu, strata, hhorig_bh,
                 starts_with(wave)) # Select vars we need for that wave - adjustable
  
  data <- mutate(data, wave = counter) # Generate wave variable
                 
  data <- data %>% rename_all(funs(str_replace_all(., wave, ""))) # Remove wave prefixes
  
  BHPS_long <- bind_rows(BHPS_long, data)
  
  rm(data) # Discard junk data
  
} # Note: factors coerced to characters



# Tidy up environment and the 'BHPS_long' long format datafile
rm(counter, wave, wavelist, BHPS_xwaveid)


BHPS_long <- BHPS_long %>% arrange(pidp, wave) %>%
             select(pidp, pid, wave, hidp, hid, pno, sampst, strata, psu, 
                    hhorig_bh, birthy, sex,
                    ivfho, ivfio1_bh, ivfio)



# Repeat process for UKHLS records - first create looping elements
wavelist <- c("a_", "b_", "c_", "d_", "e_", "f_", "g_", "h_", "i_", "j_") # Vector of wave strings

counter <- 18 # Set wave counter indicator to 18 - to pick up from BHPS 

UKHLS_long <- data.frame() # Empty dataset
 


# Create separate datasets for each wave
for (wave in wavelist) {
  
  counter <- counter + 1 # Count wave number as we loop through
  
  data <- select(UKHLS_xwaveid, pidp, sex, birthy, sampst, psu, strata, hhorig,
                 starts_with(wave)) # Select vars we need for that wave - adjustable
  
  data <- mutate(data, wave = counter) # Generate a wave variable
  
  data <- data %>% rename_all(funs(str_replace_all(., wave, ""))) # Remove wave prefix from variable names
  
  data <- data %>% mutate(pno = as.factor(pno))
  
  UKHLS_long <- bind_rows(UKHLS_long, data) # Bind into master file
  
  rm(data) # Discard junk data
  
}



# Tidy up environment and the long file
rm(counter, wave, wavelist, UKHLS_xwaveid)


UKHLS_long <- UKHLS_long %>% arrange(pidp, wave) %>%
              select(pidp, wave, hidp, pno, sampst, strata, psu, hhorig, 
                     birthy, sex, ivfho, ivfio)


# At this point you have two long format person-year files: one for people who
# ever entered the BHPS (1991-2008) and another for those recorded in UKHLS
# (2009+). This can provide the raw base for a longitudinal analysis.




#### 2. Clean up key variables from the xwaveid files ####



# Prepare variables for file combine
UKHLS_long <- UKHLS_long %>% mutate(pno = as.numeric(as.character(pno)), # Convert pno to numeric
                                    birthy = as.factor(birthy), # Convert birthy to factor
                                    sex = as.factor(sex)) # Convert sex to factor


# Build an integrated person-year file
Person_file <- bind_rows(BHPS_long, UKHLS_long) %>% 
               arrange(pidp, wave) %>%
               select(pidp:psu, hhorig_bh, hhorig, ivfho, ivfio1_bh, ivfio,
                      birthy, sex)



# Clean up some key identifiers
Person_file <- Person_file %>% mutate(strata = as.numeric(strata),
                                      psu = as.numeric(psu)) # Missing if ECHP samples


# Code up HH origin variables
Person_file <- Person_file %>% mutate(hhorigin = case_when(hhorig_bh == "bhps gb (original) 1991" |
                                                           hhorig == "bhps gb 1991"
                                                           ~ "BHPS 1991",
                                                           
                                                           hhorig_bh == "bhps sco 1999" |
                                                           hhorig == "bhps sco 1999"
                                                           ~ "BHPS Scot 1999",
                                                           
                                                           hhorig_bh == "bhps wal 1999" |
                                                           hhorig == "bhps wal 1999"
                                                           ~ "BHPS Wales 1999",
                                                           
                                                           hhorig_bh == "bhps ni 2001" |
                                                           hhorig == "bhps ni 2001"
                                                           ~ "BHPS N.I. 2001",
                                                           
                                                           hhorig_bh == "ECHP - SCPR" |
                                                           hhorig_bh == "ECHP - ONS" |
                                                           hhorig_bh == "ECHP - NI"
                                                           ~ "ECHP",
                                                           
                                                           hhorig == "ukhls gb 2009-10"
                                                           ~ "UKHLS GPS",
                                                           
                                                           hhorig == "ukhls emboost 2009-10"
                                                           ~ "UKHLS EMB",
                                                           
                                                           hhorig == "ukhls ni 2009-10"
                                                           ~ "UKHLS N.I.",
                                                           
                                                           hhorig == "ukhls iemb 2014-15"
                                                           ~ "UKHLS IEMB 2014-15"))


# Code up individual interview outcome
Person_file <- Person_file %>% mutate(int_outcome = case_when(ivfio1_bh == "Full interview" |
                                                              ivfio == "full interview" |
                                                              ivfio == "Full interview"
                                                              ~ "Full interview",
                                                              
                                                              ivfio1_bh == "Proxy" |
                                                              ivfio == "proxy interview" |
                                                              ivfio == "Proxy interview"
                                                              ~ "Proxy",
                                                              
                                                              ivfio == "telephone intvw" |
                                                              ivfio == "Telephone intvw"
                                                              ~ "Phone",
                                                              
                                                              TRUE ~ "No interview"))

# Clean up birth year which has imported as character
table(Person_file$birthy, useNA = "ifany") # Note some are 9898 (36 cases) and should be set to missing

Person_file <- Person_file %>% mutate(doby = ifelse(birthy == "9898" | birthy == "missing" |
                                                    birthy == "-1" | birthy == "-2" | birthy == "-9",
                                                    NA, as.numeric(as.character(birthy))))



# Clean up sex into a female dummy
Person_file <- Person_file %>% mutate(sex_cleaned = case_when(sex == "1" | sex == "Male"
                                                              ~ "Male",
                                                              
                                                              sex == "2" | sex == "Female"
                                                              ~ "Female",
                                                              
                                                              sex == "missing" | sex == "-9" |
                                                              sex == "-2" | sex == "-1"
                                                              ~"Missing"))



# Tidy dataset and move on
Person_file <- Person_file %>% select(pidp: hhorig, hhorigin, ivfho:ivfio,
                                      int_outcome, birthy, doby, sex, sex_cleaned)


rm(BHPS_long, UKHLS_long)




#### 3. Extract household interview files and clean up ####



# Create a list of all paths to data to be imported: BHPS only
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*hhresp.dta", full.names = T, recursive = T)

path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28)

print(path_list) # Check


BHPS_HH <- data.frame() # Empty datafile



# Import datasets and variables from BHPS
for (file in path_list) {
  
  wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() 
  # Extract wave number (replace * with character position of the wave number in your path) 
  # and convert this to a numeric variable
  
  data <- read_dta(paste(file)) # Import wave files
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(starts_with("hid"), # HH id
                          starts_with("intdate"), # interview dates
                          fihhmngrs_dv, tenure_dv, # basic HH variables 
                          gor_dv) # region
  
  data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable
                   mutate_if(haven::is.labelled, haven:: as_factor)
  
  BHPS_HH <- bind_rows(BHPS_HH, data)
  
  rm(data) # Drop junk file
  
}



# Repeat path list process for UKHLS
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*hhresp.dta", full.names = T, recursive = T)

path_list <- path_list[c(19:28)] # Discard BHPS cases from the list (rows 1-18)

print(path_list)

UKHLS_HH <- data.frame() # Empty datafile



# Repeat the looping process across UKHLS waves - but will need to manually bind due to label issues
for (file in path_list) {
  
  wave_count <- 18+(str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric()) # Extract wave no.
  
  data <- read_dta(paste(file))
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(starts_with("hid"), # HH id
                          starts_with("intdate"), # interview dates
                          fihhmngrs_dv, tenure_dv, # basic HH variables 
                          gor_dv)  # region
                 
  
  data <- data %>% mutate(wave = wave_count,
                          intdated = as.character(intdated), # Convert to character to aid binding
                          intdatem = as.character(intdatem), # as coding frame varies from wave to wave
                          intdatey = as.character(intdatey),
                          fihhmngrs_dv = as.character(fihhmngrs_dv)) %>% 
                   mutate_if(haven::is.labelled, haven::as_factor) 
                  
  
  UKHLS_HH <- bind_rows(UKHLS_HH, data)
  
  rm(data) # Drop junk file
  
}



# Tidy up
rm(file, path_list, wave_count)



# Pool BHPS and UKHLS household interview files 
hhresp_all <- bind_rows(BHPS_HH, UKHLS_HH) %>% # Warnings = coercion to factors
              select(hid, hidp, wave, intdatem, intdatey, fihhmngrs_dv,
                     tenure_dv, gor_dv)



# Prepare basic HH vars
# (1) Create interview month and year variables that includes 1991 (year) and assigns missings to NA: Month first
hhresp_all <- hhresp_all %>% mutate(intmonth = case_when(intdatem %in% c("-8", "-9", "missing") ~ "Missing",
                                                         
                                                         intdatem %in% c("1", "January") ~ "January",
                                                         
                                                         intdatem %in% c("2", "February") ~ "February",
                                                         
                                                         intdatem %in% c("3", "March") ~ "March",
                                                         
                                                         intdatem %in% c("4", "April") ~ "April",
                                                         
                                                         intdatem %in% c("5", "May") ~ "May",
                                                         
                                                         intdatem %in% c("6", "June") ~ "June",
                                                         
                                                         intdatem %in% c("7", "July") ~ "July",
                                                         
                                                         intdatem %in% c("8", "August") ~ "August",
                                                         
                                                         intdatem %in% c("9", "September") ~ "September",
                                                         
                                                         intdatem %in% c("10", "October") ~ "October",
                                                         
                                                         intdatem %in% c("11", "November") ~ "November",
                                                         
                                                         intdatem %in% c("12", "December") ~ "December"))


# Create year indicator
hhresp_all <- hhresp_all %>% mutate(intyear = ifelse(intdatey == "-8" | intdatey == "-9" | intdatey == "missing", 
                                                     NA, intdatey),
                                    
                                    intyear = ifelse(wave == 1 & is.na(intyear), "1991", intyear))
                                    # assign year = 1991 if surveyed in W1


# Combine month and year of interview into a date variable using lubridate
hhresp_all <- hhresp_all %>% mutate(intdate = paste(intmonth, intyear, sep = ","),
                                    
                                    intdate = myd(intdate, truncated = 1))
                             # Note: if missing month or year intdate is NA



# (2) Create cleaned regional variable
hhresp_all <- hhresp_all %>% mutate(region = case_when(gor_dv %in% c("east midlands", "East Midlands")
                                                                   ~ "East Midlands",
                                                       
                                                       gor_dv %in% c("east of england", "East of England")
                                                                   ~ "East of England",
                                                       
                                                       gor_dv %in% c("london", "London")
                                                                   ~ "London",
                                                       
                                                       gor_dv %in% c("north east", "North East")
                                                                   ~ "North East",
                                                       
                                                       gor_dv %in% c("north west", "North West")
                                                                    ~ "North West",
                                                       
                                                       gor_dv %in% c("northern ireland", "Northern Ireland")
                                                                   ~ "Northern Ireland",
                                                       
                                                       gor_dv %in% c("scotland", "Scotland")
                                                                   ~ "Scotland",
                                                       
                                                       gor_dv %in% c("south east", "South East")
                                                                   ~ "South East",
                                                       
                                                       gor_dv %in% c("south west", "South West", "Channel Islands")
                                                                   ~ "South West",
                                                       
                                                       gor_dv %in% c("wales", "Wales")
                                                                   ~ "Wales",
                                                       
                                                       gor_dv %in% c("west midlands", "West Midlands")
                                                                  ~ "West Midlands",
                                                       
                                                       gor_dv %in% c("yorkshire and the humber",
                                                                     "Yorkshire and the Humber")
                                                                  ~ "Yorkshire and the Humber",
                                    
                                                       gor_dv == "missing" ~ "Missing"))



# (3) HH income as numeric variable 
hhresp_all <- hhresp_all %>% mutate(hh_monthly_inc = as.numeric(fihhmngrs_dv))




# Merge household interview data onto master datafile and tidy up
Person_file <- merge(Person_file, hhresp_all,
                     by = c("hidp", "wave"),
                     all.x = T)


Person_file <- Person_file %>% arrange(pidp, wave) %>%
                               select(pidp, pid, hidp, hid.x, wave, pno:ivfho, intdatem, intdatey, intmonth, intyear,
                                      intdate, ivfio1_bh:sex_cleaned, fihhmngrs_dv, hh_monthly_inc, 
                                      tenure_dv, gor_dv, region) %>%
                               rename("hid" = "hid.x") 


rm(BHPS_HH, UKHLS_HH, hhresp_all)



# The Person_file now comprises a person-year file with merged HH response 
# information derived from the household questionnaire.




#### 4. Extract xwavedat time-constant individual variables and clean up ####



UKHLS_xwavedat <- read_dta(file = "****/UKDA-6614-stata/stata/stata13_se/ukhls_wx/xwavedat.dta")


UKHLS_xwavedat <- UKHLS_xwavedat %>% select(pidp, bornuk_dv, racel_dv) %>% # Ethnicity, UK birth
                                     mutate_if(haven::is.labelled, haven:: as_factor) %>%
                                     mutate(pidp = as.numeric(as.character(pidp)))



# Prepare UK born dummy
UKHLS_xwavedat <- UKHLS_xwavedat %>% mutate(babroad = case_when(bornuk_dv %in% c("missing", "inapplicable",
                                                                                 "refusal", "don't know")
                                                                               ~ "Missing",
                                                                
                                                                bornuk_dv == "born in uk" ~ "UK born",
                                                                
                                                                bornuk_dv == "not born in uk" ~ "Born overseas"))


# Merge cross-wave data onto the master file
Person_file <- merge(Person_file, UKHLS_xwavedat,
                     by = "pidp", all.x = T)


rm(UKHLS_xwavedat) # Tidy up


Person_file <- Person_file %>% select(pidp:sex_cleaned, bornuk_dv, babroad,
                                      racel_dv, fihhmngrs_dv:region)




#### 5. Import and prepare indall enumeration variables ####



# Create a list of all paths to data to be imported: BHPS only
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indall.dta", full.names = T, recursive = T)

path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28)

print(path_list)

BHPS_indall <- data.frame() # Empty input dataframe



# Import datasets and variables from BHPS
for (file in path_list) {
  
  wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() 
  # Extract wave number (input digit position of wave numbers into the file path) and convert to numeric
  
  data <- read_dta(paste(file)) # Import wave files
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(pidp, # Core id variables
                          age, ppno, nchild_dv)
  
  data <- data %>% mutate(wave = wave_count) %>%  # Generate wave variable
                   mutate_if(haven::is.labelled, haven::as_factor)
  
  BHPS_indall <- bind_rows(BHPS_indall, data)
    
  rm(data) # Drop junk file
  
}




# Repeat extraction for UKHLS cases
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indall.dta", full.names = T, recursive = T)

path_list <- path_list[c(19:28)] # Discard BHPS

print(path_list)

UKHLS_indall <- data.frame() # Empty input dataframe


for (file in path_list) {
  
  wave_count <- 18+(str_sub(paste(file), *, *)) %>% str_remove_all("[w/]") %>% as.numeric() 
  # Replace the * with the character position of the wave numbers in your path

  data <- read_dta(paste(file)) # Import wave files
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(pidp, # Core id variables
                          age_dv, ppno, nchild_dv)
  
  data <- data %>% mutate(wave = wave_count) %>%  # Generate wave variable
                   mutate_if(haven::is.labelled, haven::as_factor)
  
  UKHLS_indall <- bind_rows(UKHLS_indall, data)
  
  rm(data) # Drop junk file
  
}



# Pool waves
UKHLS_indall <- UKHLS_indall %>% mutate(nchild_dv = as.character(nchild_dv))

indall <- bind_rows(BHPS_indall, UKHLS_indall)



# Clean variables up: (1) age
indall <- indall %>% mutate(respage = ifelse(wave <= 18, 
                                             as.numeric(as.character(age)), 
                                                         
                                             as.numeric(as.character(age_dv))))
# Note: NA induced by coercion warning can be ignored



# (2) Living with kids under 16 dummy variable (as parent figure)
indall <- indall %>% mutate(kids = case_when(nchild_dv == "-9" ~ "Missing",
                                             
                                             nchild_dv == "0" ~ "No kids",
                                             
                                             nchild_dv %in% c("1", "2", "3",
                                                              "4", "5", "6",
                                                              "7", "8", "9",
                                                              "10") ~ "Kids"))



# (3) Living with a partner/spouse and their pno
indall <- indall %>% mutate(partnered = case_when(ppno %in% c("don't know", "missing", "refusal")
                                                  ~ "Missing",
                                                  
                                                  ppno %in% c("No Spouse in HH", "partner not in hh")
                                                  ~ "No partner",
                                                  
                                                  ppno %in% c("1", "2", "3", "4", "5", "6", "7", "8",
                                                              "9", "10", "11", "12", "13", "14", "15",
                                                              "16") ~ "Partner"),
                            
                            partpno = ifelse(partnered == "Partner", as.numeric(as.character(ppno)), NA))



# Merge onto master and tidy up
Person_file <- merge(Person_file, indall,
                     by = c("pidp", "wave"),
                     all.x = T)


Person_file <- Person_file %>% select(pidp, pid, wave, hidp:doby, age, age_dv, respage,
                                      sex:racel_dv, ppno, partnered, partpno, nchild_dv, kids,
                                      fihhmngrs_dv:region) %>%
                               arrange(pidp, wave)


rm(BHPS_indall, UKHLS_indall, indall, file, path_list, wave_count)                                    
                                              
                                              
                                                
                                    
#### 6. Import and prepare key indresp individual interview variables ####



# BHPS importation
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indresp.dta", full.names = T, recursive = T)

path_list <- path_list[-c(19:28)] # Discard UKHLS cases from the list (rows 19-28)

print(path_list)

BHPS_indresp <- data.frame()



# Import datasets and individual interview variables from BHPS
for (file in path_list) {
  
  wave_count <- str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() 
  
  data <- read_dta(paste(file)) # Import wave files
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(pidp, # Core id variables
                          hiqual_dv, jbstat, fimngrs_dv,
                          jbhas, jboff, jbnssec8_dv,
                          starts_with("lrwght"),
                          starts_with("lrwt")) # Keep longitudinal weights
  
  data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable
                   mutate_if(haven::is.labelled, haven::as_factor)
    
  BHPS_indresp <- bind_rows(BHPS_indresp, data)
  
  rm(data) # Drop junk file
  
}



# Repeat listing path process for UKHLS
path_list <- dir(path ="****/UKDA-6614-stata/stata/stata13_se", pattern = "*indresp.dta", full.names = T, recursive = T)

path_list <- path_list[c(19:28)] # Discard BHPS cases

print(path_list)

UKHLS_indresp <- data.frame()



# Extract UKHLS datasets
for (file in path_list) {
  
  wave_count <- 18 + str_sub(paste(file), *, *) %>% str_remove_all("[w/]") %>% as.numeric() 
  
  data <- read_dta(paste(file)) # Import wave files
  
  data <- data %>% rename_all(funs(str_remove_all(.,"^.*?_"))) # Remove wave prefixes on all vars
  
  data <- data %>% select(pidp, # Core id variables
                          hiqual_dv, jbstat, fimngrs_dv, 
                          jbhas, jboff, jbnssec8_dv, 
                          starts_with("indinus"), 
                          starts_with("indinub")) # Keep longitudinal weights
  
  data <- data %>% mutate(wave = wave_count) %>% # Generate wave variable
                   mutate_if(haven::is.labelled, haven::as_factor) 
    
  UKHLS_indresp <- bind_rows(UKHLS_indresp, data)
  
  rm(data) # Drop junk file
  
}
  
  

# Bind the BHPS and UKHLS indresp files together
indresp <- bind_rows(BHPS_indresp, UKHLS_indresp)



# Start cleaning some key variables we need: (1) degree qualifications
indresp <- indresp %>% mutate(degree = case_when(hiqual_dv %in% c("missing", "Missing",
                                                                  "inapplicable") 
                                                                 ~ "Missing",
                                                 
                                                 hiqual_dv %in% c("Degree", "Other higher",
                                                                  "Other higher degree")
                                                                 ~ "Degree",
                                                 
                                                 TRUE ~ "No degree"))



# (2) Generate ILO employment status variable (employed) using jbhas and jboff
indresp <- indresp %>% mutate(jbhas_cleaned = case_when(jbhas %in% c("don't know", "inapplicable",
                                                                     "missing", "refusal") ~ "Missing",
                                                        
                                                        jbhas %in% c("no", "No") ~ "No",
                                                        
                                                        jbhas %in% c("yes", "Yes") ~ "Yes"),
                              
                              jboff_cleaned = case_when(jboff %in% c("don't know", "inapplicable",
                                                                     "missing", "refusal") ~ "Missing",
                                                        
                                                        jboff %in% c("no", "No", "Waiting for job",
                                                                     "waiting to take up job", "Waiting to take up job")
                                                        ~ "No",
                                                        
                                                        jboff %in% c("yes", "Yes") ~ "Yes"))



indresp <- indresp %>% mutate(employed = case_when(jbhas_cleaned == "Missing" | 
                                                   (jbhas_cleaned == "No" & jboff_cleaned == "Missing")
                                                   ~ "Missing",
                                                   
                                                   jbhas_cleaned == "Yes" | 
                                                   (jbhas_cleaned != "Missing" & jboff_cleaned == "Yes")
                                                   ~ "Employed",
                                                   
                                                   jbhas_cleaned == "No" & jboff_cleaned == "No"
                                                   ~ "Not employed"))



# (3) NSSEC of occupation for those currently working
indresp <- indresp %>% mutate(emp_type = case_when(employed == "Missing" ~ "Missing",
                                                           
                                                   employed == "Not employed" ~ "Not employed",
                                                           
                                                   employed == "Employed" & 
                                                   jbnssec8_dv %in% c("Higher professional",
                                                                      "Lower management & professional",
                                                                      "Large employers & higher management")
                                                                      ~ "Employed, manag/prof",
                                                           
                                                   employed == "Employed" &
                                                   jbnssec8_dv %in% c("Intermediate",
                                                                      "Lower supervisory & technical",
                                                                      "Routine",
                                                                      "Semi-routine",
                                                                      "Small employers & own account")
                                                                      ~ "Employed, other",
                                                           
                                                   employed == "Employed" &
                                                   jbnssec8_dv %in% c("don't know", 
                                                                      "inapplicable",
                                                                      "missing")
                                                                      ~ "Employed, no NSSEC"))



# (4) Individual income from all sources
indresp<- indresp%>% mutate(fimngrs_dv = as.numeric(as.character(fimngrs_dv)))

indresp <- indresp %>% mutate(indinc = ifelse(fimngrs_dv<0, 0, fimngrs_dv)) # Code negatives to be 0

summary(indresp$fimngrs_dv)
summary(indresp$indinc)



# Merge onto master person year file and tidy up
Person_file <-merge(Person_file, indresp,
                    by = c("pidp", "wave"),
                    all.x = T)


Person_file <- Person_file %>% select(pidp:kids, hiqual_dv, degree, jbstat,
                                      jbhas, jbhas_cleaned, jboff, jboff_cleaned,
                                      employed, jbnssec8_dv, emp_type,
                                      fimngrs_dv, indinc, 
                                      fihhmngrs_dv:region,
                                      lrwght:indinub_lw)


rm(indresp, BHPS_indresp, UKHLS_indresp, file, path_list, wave_count)



# The Person_file dataset contains 2,038,636 person-year observations for 
# everyone observed in either BHPS or UKHLS. Many of the rows are empty as the 
# person was not actually interviewed in that sweep.

# 61 variables are included on the base person-year file. Users can adapt the
# above code to add, discard and derive additional variables as required.

# If you use this file in your work please include a citation to the data 
# collection. Do get in touch if you find any problems or errors in the code.