# ------------------------------------------------
# Data_cleaning.R
# ------------------------------------------------
# Code provided as is and can be used or modified freely. This code only allows to used for academic purpose.
# ------------------------------------------------
# Author: Bin Chi, Adam Dennett
# UCL Centre for Advanced Spatial Analysis
# bin.chi.16@ucl.ac.uk ; a.dennett@ucl.ac.uk
# Date: 20/12/2019
# ------------------------------------------------

# Load functions
library("RPostgreSQL")
library("dplyr")
library("sqldf")
# Matchleft1 function returns the unmatched the records
matchleft1 <- function(x,y){
  next0 <- x[!(x$transactionid %in% y$transactionid),]
  
}


# Loads the PostgreSQL driver
drv <- dbDriver("PostgreSQL")
# Creates a connection to the datajournal database
con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456)

# Read transaction data, linked id data and EPC data from data linkage process
result2 <- dbGetQuery(con,"select * from casaresult2")
tran <- dbGetQuery(con,"select * from casatra20111")
epc <- dbGetQuery(con,"select * from epcformatch")

############### Clean up missing or untenable values ###############

## Clean up the linked data 
tran1<-merge(tran,result2,by="transactionid")
#6753307

tran2<-merge(tran1,epc,by="id")
# 6753307

# Transaction's total floor area or number of habitable rooms are NA value or 0.
pp1 <-subset(tran2,is.na(tran2$tfarea)|is.na(tran2$numberrooms)|tran2$numberrooms==0|tran2$tfarea==0)
dim(pp1)
#1016247
tran2 <- matchleft1(tran2,pp1)
dim(tran2)
#5737060  

# Transaction's total floor area is small than 9 m2 or bigger than 974 m2
pp2 <-  subset(tran2,tran2$tfarea<9 |tran2$tfarea>974)
dim(pp2)
###555

tran2 <- matchleft1(tran2,pp2)
dim(tran2)
#5736505
# Transaction's price per total floor area is bigger than 50000 £/m2  or transaction price per total floor area is small than 200 £/m2 .
tran2$priceper <- tran2$price/tran2$tfarea
pp3<- tran2[tran2$priceper>50000|tran2$priceper<200,]
dim(pp3)
#841
tran2 <- matchleft1(tran2,pp3)
dim(tran2)
#5735664
# Transaction's floor size per habitable room is bigger than 100 m2.
tran2$floorper <- tran2$tfarea/tran2$numberrooms
pp4 <- tran2[tran2$floorper>100,]
dim(pp4)
#887
tran2 <- matchleft1(tran2,pp4)
dim(tran2)
# 5734777

# Transaction's number of habitable rooms are bigger than 20.
pp5 <- tran2[tran2$numberrooms>20,]
dim(pp5)
#476

tran2 <- matchleft1(tran2,pp5)
dim(tran2)
##5734301

# Transaction's floor size per habitable room is smaller than 6.51 m2
pp6<- tran2[tran2$floorper<6.51,]
dim(pp6)
## 1463

tran2 <- matchleft1(tran2,pp6)
dim(tran2)
# 5732838 


# Combine the data with missing or untenable values
pp1$iid<-"pp1"
pp2$iid<-"pp2"
pp3$iid<-"pp3"
pp4$iid<-"pp4"
pp5$iid<-"pp5"
pp6$iid<-"pp6"


proepc<- sqldf("
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp1 UNION 
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp2 UNION 
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp3 UNION 
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp4 UNION 
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp5 UNION 
               select distinct transactionid,yearchi,tfarea,price,numberrooms,iid from pp6
               ", drv="SQLite", dbname=":memory:" )


con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456)
dbWriteTable(con, "casaproepc",value =proepc, append = TRUE, row.names = FALSE)

tran2 <- matchleft1(tran2,proepc)
dim(tran2)
# 5732838 , tran2 is the final linked data

## Renaming columns within the final linked data
colnames(tran2)[4]<-"postcode"
colnames(tran2)[7]<-"propertytype"
colnames(tran2)[19]<-"year"
colnames(tran2)[31]<-"postcode_epc"
colnames(tran2)[32]<-"propertytype_epc"
tran2<-tran2[,-38]

dbWriteTable(con, "transactioncasa",value =tran2, append = TRUE, row.names = FALSE)
# Delete the address string columns 
tran2011_19 <- subset(tran2, select = -c(address1, address2,address3,address,postcode_epc))
colnames(tran2011_19)[25]<-"rgn11nm"

dim(tran2011_19)
#5732838      32
##tran2011_19 is the linked dataset we created
write.csv(tran2011_19,"tran2011_19.csv",row.names=FALSE)
dbWriteTable(con, "transactioncasasimple",value =tran2011_19, append = TRUE, row.names = FALSE)

############ Note: Except the EPC fields we used before,  the rest of fields in EPCs can add in the linked data by using the common column id between two datasets  ############ 

tran2011_19<- dbGetQuery(con,"select * from transactioncasasimple")
epc <- dbGetQuery(con,"select * from data20190831id")


# Delete the address and postcode information from EPCs dataset
epc<- subset(epc, select = -c(add1, add2,add3,add,ADDRESS,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE))
# Save this non-address domestic EPC dataset as epc_id.csv. This is the epc_id.csv deposited in UKDA 
write.csv(epc,"epc_id.csv",row.names=FALSE)

# NOTE:We removed the address and postcode fields (which comes from EPC dataset) in the linked data to make sure this data is an open data. If you prefer to create a new linked data yourself, you need to understand and accept the licence and copyright for the data and then download. Please check your copyright notice when you use the address or postcode fields in the EPC dataset (https://epc.opendatacommunities.org/docs/copyright)
# Delete the columns exited in the tran2011_19 from EPCs to save memory
epc<- subset(epc, select = -c(INSPECTION_DATE, LODGEMENT_DATE,TOTAL_FLOOR_AREA, NUMBER_HABITABLE_ROOMS,PROPERTY_TYPE))


# Link the rest of fields in EPCs to the linked data
tranall<-merge(tran2011_19, epc,by="id")

# Save the data
dbWriteTable(con, "transactionall",value =tranall, append = TRUE, row.names = FALSE)
write.csv(tranall,"tranall2011_19.csv",row.names=FALSE)
#tranall2011_19.csv is the result of linked data. It has been deposited in UKDA