# ------------------------------------------------ # Data_cleaning.R # ------------------------------------------------ # Code provided as is and can be used or modified freely. This code only allows to used for academic purpose. # ------------------------------------------------ # Author: Bin Chi, Adam Dennett # UCL Centre for Advanced Spatial Analysis # bin.chi.16@ucl.ac.uk ; a.dennett@ucl.ac.uk # Date: 20/12/2019 # ------------------------------------------------ # Load functions library("RPostgreSQL") library("dplyr") library("sqldf") # Matchleft1 function returns the unmatched the records matchleft1 <- function(x,y){ next0 <- x[!(x$transactionid %in% y$transactionid),] } # Loads the PostgreSQL driver drv <- dbDriver("PostgreSQL") # Creates a connection to the datajournal database con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456) # Read transaction data, linked id data and EPC data from data linkage process result2 <- dbGetQuery(con,"select * from casaresult2") tran <- dbGetQuery(con,"select * from casatra20111") epc <- dbGetQuery(con,"select * from epcformatch") ############### Clean up missing or untenable values ############### ## Clean up the linked data tran1<-merge(tran,result2,by="transactionid") #6753307 tran2<-merge(tran1,epc,by="id") # 6753307 # Transaction's total floor area or number of habitable rooms are NA value or 0. pp1 <-subset(tran2,is.na(tran2$tfarea)|is.na(tran2$numberrooms)|tran2$numberrooms==0|tran2$tfarea==0) dim(pp1) #1016247 tran2 <- matchleft1(tran2,pp1) dim(tran2) #5737060 # Transaction's total floor area is small than 9 m2 or bigger than 974 m2 pp2 <- subset(tran2,tran2$tfarea<9 |tran2$tfarea>974) dim(pp2) ###555 tran2 <- matchleft1(tran2,pp2) dim(tran2) #5736505 # Transaction's price per total floor area is bigger than 50000 £/m2 or transaction price per total floor area is small than 200 £/m2 . tran2$priceper <- tran2$price/tran2$tfarea pp3<- tran2[tran2$priceper>50000|tran2$priceper<200,] dim(pp3) #841 tran2 <- matchleft1(tran2,pp3) dim(tran2) #5735664 # Transaction's floor size per habitable room is bigger than 100 m2. tran2$floorper <- tran2$tfarea/tran2$numberrooms pp4 <- tran2[tran2$floorper>100,] dim(pp4) #887 tran2 <- matchleft1(tran2,pp4) dim(tran2) # 5734777 # Transaction's number of habitable rooms are bigger than 20. pp5 <- tran2[tran2$numberrooms>20,] dim(pp5) #476 tran2 <- matchleft1(tran2,pp5) dim(tran2) ##5734301 # Transaction's floor size per habitable room is smaller than 6.51 m2 pp6<- tran2[tran2$floorper<6.51,] dim(pp6) ## 1463 tran2 <- matchleft1(tran2,pp6) dim(tran2) # 5732838 # Combine the data with missing or untenable values pp1$iid<-"pp1" pp2$iid<-"pp2" pp3$iid<-"pp3" pp4$iid<-"pp4" pp5$iid<-"pp5" pp6$iid<-"pp6" proepc<- sqldf(" select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp1 UNION select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp2 UNION select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp3 UNION select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp4 UNION select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp5 UNION select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp6 ", drv="SQLite", dbname=":memory:" ) con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456) dbWriteTable(con, "casaproepc",value =proepc, append = TRUE, row.names = FALSE) tran2 <- matchleft1(tran2,proepc) dim(tran2) # 5732838 , tran2 is the final linked data ## Renaming columns within the final linked data colnames(tran2)[4]<-"postcode" colnames(tran2)[7]<-"propertytype" colnames(tran2)[19]<-"year" colnames(tran2)[31]<-"postcode_epc" colnames(tran2)[32]<-"propertytype_epc" tran2<-tran2[,-38] dbWriteTable(con, "transactioncasa",value =tran2, append = TRUE, row.names = FALSE) # Delete the address string columns tran2011_19 <- subset(tran2, select = -c(address1, address2,address3,address,postcode_epc)) colnames(tran2011_19)[25]<-"rgn11nm" dim(tran2011_19) #5732838 32 ##tran2011_19 is the linked dataset we created write.csv(tran2011_19,"tran2011_19.csv",row.names=FALSE) dbWriteTable(con, "transactioncasasimple",value =tran2011_19, append = TRUE, row.names = FALSE) ############ Note: Except the EPC fields we used before, the rest of fields in EPCs can add in the linked data by using the common column id between two datasets ############ tran2011_19<- dbGetQuery(con,"select * from transactioncasasimple") epc <- dbGetQuery(con,"select * from data20190831id") # Delete the address and postcode information from EPCs dataset epc<- subset(epc, select = -c(add1, add2,add3,add,ADDRESS,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE)) # Save this non-address domestic EPC dataset as epc_id.csv. This is the epc_id.csv deposited in UKDA write.csv(epc,"epc_id.csv",row.names=FALSE) # NOTE:We removed the address and postcode fields (which comes from EPC dataset) in the linked data to make sure this data is an open data. If you prefer to create a new linked data yourself, you need to understand and accept the licence and copyright for the data and then download. Please check your copyright notice when you use the address or postcode fields in the EPC dataset (https://epc.opendatacommunities.org/docs/copyright) # Delete the columns exited in the tran2011_19 from EPCs to save memory epc<- subset(epc, select = -c(INSPECTION_DATE, LODGEMENT_DATE,TOTAL_FLOOR_AREA, NUMBER_HABITABLE_ROOMS,PROPERTY_TYPE)) # Link the rest of fields in EPCs to the linked data tranall<-merge(tran2011_19, epc,by="id") # Save the data dbWriteTable(con, "transactionall",value =tranall, append = TRUE, row.names = FALSE) write.csv(tranall,"tranall2011_19.csv",row.names=FALSE) #tranall2011_19.csv is the result of linked data. It has been deposited in UKDA