# ------------------------------------------------
# Evaluation.R
# ------------------------------------------------
# Code provided as is and can be used or modified freely. This code only allows to used for academic purpose.
# ------------------------------------------------
# Author: Bin Chi, Adam Dennett
# UCL Centre for Advanced Spatial Analysis
# bin.chi.16@ucl.ac.uk ; a.dennett@ucl.ac.uk
# Date: 20/12/2019
# ------------------------------------------------
######## function#######

# load functions
library("dplyr")
library("qdap")
library(data.table)
library(stringr)
library("ggplot2")
library("sqldf")
library("RPostgreSQL")
library("stringr")
library(tictoc)
library(profvis)
library("ggplot2")
library("entropy")
library("FNN")
library(ggrepel)
#uniquerresult function returns the successful linked research which is one to one match realtionship
uniqueresult <- function(x){
  
  dt <- as.data.table(x)
  
  esummary<-dt[,.(count=.N),by=transactionid]
  
  idd1 <- esummary[esummary$count==1,]
  
  result1 <- x[x$transactionid %in% idd1$transactionid,]
  
  return(result1)
}


#doubleresult function returns the successful linked research which is one to many match realtionship
doubleresult <-  function(x){
  
  dt <- as.data.table(x)
  
  esummary<-dt[,.(count=.N),by=transactionid]
  
  idd2 <- esummary[esummary$count!=1,]
  
  need1 <- x[x$transactionid %in% idd2$transactionid,]
  
  return(need1)
}

#matchleftfunction returns the unmatched the records with 12 specific variables
matchleft <- function(x,y){
  next0 <- x[!(x$transactionid %in% y$transactionid),c("transactionid","price","dateoftransfer","postcode","propertytype","oldnew","duration","paon","saon","street","locality","towncity")]
  return(next0)
  
}
#matchleft1 function returns the unmatched the records
matchleft1 <- function(x,y){
  next0 <- x[!(x$transactionid %in% y$transactionid),]
  
}

#matchleft function returns the matched the records with 12 specific variables
tranneed<- function(x,y){
  next0 <- x[(x$transactionid %in% y$transactionid),c("transactionid","price","dateoftransfer","postcode","propertytype","oldnew","duration","paon","saon","street","locality","towncity")]
  return(next0)
  
}


# numextract function extracts first number in the string.  
numextract <- function(string){ 
  str_extract(string, "\\-*\\d+\\.*\\d*")
} 

## checkre1 returns the first five records with 17 specific variables
Utilizingcheckre1 <- function(ta)
{a1<- ta[1:5,c("transactionid","dateoftransfer","postcode.x","saon","paon","street","locality","id","add","add1","add2","add3","propertytype.x","propertytype.y","tfarea","inspectiondate","addressf")]
return(a1)
}



# ------------------------------------------------
###############read in data from postGIS######

#loads the PostgreSQL driver
drv <- dbDriver("PostgreSQL")
#creates a connection to the datajournal  database
con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456)

#read the linked data from data linkage process
result2 <- dbGetQuery(con,"select * from casaresult2")


#read in original Land Registry data 
tran <- dbGetQuery(con,"select * from pricepaid")
#select the linked EPC data
tran1 <- tran[tran$transactionid %in% result2$transactionid, ]

# ----------------------histogram plot of the linked data and original data after 2009------------------------
#select transactionid, price,yearchi columns from original data. yearchi column is the column which refers the year period for each transaction


tran11 <- tran[,c("transactionid","price","yearchi")]
dim(tran11)
# 24133118        3
#select transactionid, price,yearchi columns from linked EPC data.
tran12<- tran1[,c("transactionid","price","yearchi")]
dim(tran12)
#16846834        3


#caluate the orgianl data volumn for each year
groupoy<- group_by(tran11,yearchi)
summaryoy <- summarize(groupoy,counto= n())
#caluate the linked data volumn for each year
grouptoy1<- group_by(tran12,yearchi)
summarytoy1 <- summarize(grouptoy1, count= n())

#merge summaryoy and summarytoy1 and calculate the annual match rate. pp is annual match rate
sumyall <- merge(summaryoy,summarytoy1,by="yearchi",all.x=T)
sumyall$pp <- sumyall$count/sumyall$counto

#save it in the R folder
setwd("D:/R")
write.csv(sumyall,"sumyintialmatch.csv",row.names=FALSE)


##rename two column in sumyall, prepare for plotting the annual match rate
colnames(sumyall)[4]<-"Proportion"
colnames(sumyall)[1]<-"Year"

###############plot the match rate between 1995 and 2019###############
dev.new()
ggplot(data=sumyall, aes(y=Proportion, x=Year))+geom_point( aes(y=Proportion, x=Year),size=3.2)+geom_path(aes(y=Proportion, x=Year),color="darkblue",size=1.6, lineend = "round")+
  theme_bw()+ scale_y_continuous(labels=scales::percent,limits = c(0.5, 1))+scale_x_continuous(limits = c(1995,2020),breaks = c(1995,2000,2005,2008,2010,2015,2019))+
  theme(axis.title = element_text(size=20), axis.text = element_text(size=18),strip.text = element_text(size=20),legend.text = element_text(size=18),legend.title = element_text(size=20),axis.title.y.right  = element_text(margin = margin(t = 0, r = 100, b = 0, l = 100)))+
  theme(legend.justification=c(1.05,0), legend.position=c(0.99,0.03))+ylab("Match rate")

# ----------------------end--------------------------

# ----------------------histogram plot before and after linkage  between 2009 and 2019--------------------------


tran11$P <- "Original data"
tran12$P <- "Linked data"

tran22 <- tran11[,c("price","yearchi","P")]
tran33 <- tran12[,c("price","yearchi","P")]

# this takes time to run
alldata <- rbind(tran22,tran33)

alldata$logprice <- log(alldata$price)   

#select the data for the period after 2009
alldata<-alldata[alldata$yearchi>2008,]
dim(alldata)
#16353895    4
data1<-alldata
rm(alldata)
###############plot histogram plot before and after linkage  between 2009 and 2019 ###############
dev.new()
# this plot takes time
ggplot(data1,aes(x=logprice,group=P,fill=P)) +geom_histogram(bins =100,color="black",position="identity",alpha=.5)+facet_wrap(~ yearchi,ncol = 2)+
  xlab("Log of house price")+
  theme_bw()+
  labs(fill = "Data")+
  scale_fill_manual(values = c( "#045a8d","#f7fcfd"))+
  theme(legend.position=c(0.80,0.10),axis.title = element_text(size=15), axis.text = element_text(size=12),strip.text = element_text(size=15),legend.text = element_text(size=12),legend.title = element_text(size=13))

rm(data1)
# ----------------------end--------------------------

# ----------------------quantify the information lost in the linked data by using K-S test and J-divergence method--------------------------

###############K-S test###############

#select the data after 2008
tran22<-tran22[tran22$yearchi>2008,]
tran33<-tran33[tran33$yearchi>2008,]

yearui <- unique(tran22$yearchi)
c<- 0

pc <- 0
resultts <- data.frame()
j <- 1
for(pc in yearui) 
  
{ 
  
  print(pc)
  autodata <- subset(tran22,yearchi==pc)
  autodata1 <- subset(tran33,yearchi==pc)
  c <- ks.test(autodata$price,autodata1$price)
  resultts[j,"year"] <-pc
  resultts[j,"D"] <- c$statistic
  resultts[j,"Pvalue"] <-c$p.value
  resultts[j,"alternative"] <- c$alternative
  resultts[j,"method"] <- c$method
  j <- j+1
}

#save the K-S test result 
write.csv(resultts, file = "resultks.csv",row.names=FALSE)


###############J-divergence###############


#fuction 
tabchi3 <- function(A,B)
{
  maxc <- max(log(A$price))
  
  xx <- round(maxc,0)+1
  ra <- (xx-0)/100
  groupnumber <- c(1:100)
  m <- 0
  n <- 0
  j <- 1
  
  
  result1 <- data.frame()
  
  for( j in groupnumber)
    
  {
    print(j)
    n <- j*ra
    m <- n-ra
    print(n)
    print(m)
    
    A$loprice <- log(A$price)
    B$loprice <- log(B$price)
    
    data1<-A[A$loprice >= m & A$loprice<n,]
    data2<-B[B$loprice >= m & B$loprice<n,]
    result1[j,"id"] <-  paste("group",j,sep="")
    result1[j,"orginal"] <-  nrow(data1)
    result1[j,"link"] <-  nrow(data2)
    
  }
  
  result <- data.frame( or=result1$orginal,
                        link=result1$link,
                        row.names =result1$id )
  return(result )
  
}



###log price 100 group
yearui <- unique(tran22$yearchi)
pc <- 0
j <- 1

result100 <- data.frame()

for(pc in yearui) 
{ 
  print(pc)
  data1 <- subset(tran22,tran22$yearchi==pc)
  data2 <- subset(tran33,tran33$yearchi==pc)
  data3 <- tabchi3(data1,data2)
  data3 <- data3[data3$or!=0,]
  CHIT <- chisq.test(data3)
  result100[j,"year"] <- pc
  result100[j,"xsquared"] <- CHIT$statistic
  result100[j,"df"] <- CHIT$parameter
  result100[j,"pvaule"] <- CHIT$p.value
  result100[j,"method"] <- CHIT$method
  
  data3[data3$link==0,"link"] <- 1
  
  
  result100[j,"J"] <- KL.plugin(data3$or,data3$link)+ KL.plugin(data3$link,data3$or)
  
  j <- j+1
}
#save the J-divergence result
write.csv(result100, file = "result100.csv",row.names=FALSE)


##plot the K-S resutl and J-divergence

data1<-result100[,c("year","J")]
data2<-resultts[,c("year","D")]
totald<-merge(data1,data2,by="year")  
write.csv(totald, file = "totald.csv",row.names=FALSE)
colnames(totald)<-c("Year","J-divergence","Statistic D")
dev.new()
ggplot(data=totald, aes(x=Year))+ geom_line(aes(y =`Statistic D`),color="#377eb8",size=1.0)+geom_point(aes(y =totald$`Statistic D` ),color="#377eb8",size=2.5)+
  geom_line(aes(y =(totald$`J-divergence`* 100)),color="#ff7f00",size=1.0)+geom_point(aes(y =(totald$`J-divergence`* 100)),color="#ff7f00",size=2.5 )+
  scale_y_continuous(sec.axis = sec_axis(~ (.)*0.01,name = "J-divergence"))+
  theme_bw()+
  theme(axis.title = element_text(size=14), axis.text = element_text(size=13),strip.text.x = element_text(size=13),strip.text.y = element_text(size=13),legend.text = element_text(size=14),legend.title = element_text(size=14),axis.title.y.left = element_text(margin = margin(t = 0, r = 10, b = 0, l = 10),color="#377eb8"),axis.title.y.right = element_text(margin = margin(t = 0, r = 10, b = 0, l = 10),color="#ff7f00"))+
  scale_x_continuous(breaks = c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019))+
  ylab("Statistic D")

rm(totald)
rm(result100)
rm(resultts)

head(tran2011)
head(tran20111)

rm(tran33)
rm(tran22)
# ----------------------end--------------------------
# ----------------------Evaluation the linkage for the period between 2011 and 2019--------------------------

tran2011<-tran[tran$yearchi > 2010,]
#7249259      17
tran20111<-tran1[tran1$yearchi> 2010,]
#6753335      17
dim(tran20111)[1]/dim(tran2011)[1]
#6753335/7249259=0.9315897
dbWriteTable(con, "casatran2011",value = tran2011, append = TRUE, row.names = FALSE)
dbWriteTable(con, "casatran20111",value = tran20111, append = TRUE, row.names = FALSE)
###############calculate the  match rate for different property type###############
grouptypeo<- group_by(tran2011,propertytype)

summarytypeo <- summarize(grouptypeo,
                          counto= n())

grouptype1<- group_by(tran20111,propertytype)

summarytype1 <- summarize(grouptype1,
                          count1= n())
head(summarytypeo)
head(summarytype1)
sum <- merge(summarytypeo,summarytype1,by="propertytype")
sum$pro <- sum$count1/sum$counto

write.csv(sum,"sumproprtytype.csv",row.names=FALSE)
rm(sum)
############### evaluate the spatail match rate ###############
con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456)
nspl2019<- dbGetQuery(con,"select * from nspl2019")


nspl20191<-nspl2019[,c("pcds","oa11","lsoa11","msoa11")]
colnames(nspl20191)[1]<-"postcode"

dim(tran2011)
#7249259 

tra2011<-merge(tran2011,nspl20191,by="postcode")
dim(tra2011)
#7246258  
#7246258. lost 3001


dim(tran20111)
#6753335 
tra20111 <- merge(tran20111,nspl20191,by="postcode")
dim(tran20111)[1]-dim(tra20111)[1]
#6753307 28 lost

#read in the Output Area (2011) to Built-up Area Sub-division to Built-up Area to Local Authority District to Region (December 2011) Lookup in England and Wales table
#http://geoportal.statistics.gov.uk/datasets/output-area-2011-to-built-up-area-sub-division-to-built-up-area-to-local-authority-district-to-region-december-2011-lookup-in-england-and-wales
oa_la_region<-read.csv("Output_Area_2011_to_Builtup_Area_Subdivision_to_Builtup_Area_to_Local_Authority_District_to_Region_December_2011_Lookup_in_England_and_Wales.csv")
colnames(oa_la_region)<-c("oa11","BUASD11CD","BUASD11NM","BUA11CD","BUA11NM","laua","lad11nm","LAD11NMW ","gor","RGN11NM","RGN11NMW","ObjectId")
oa_la_region<-oa_la_region[,c("oa11","laua","lad11nm","gor","RGN11NM")]



tra2011<-merge(tra2011, oa_la_region,by="oa11" )
dim(tra2011)
#7246256

tra20111<-merge(tra20111,oa_la_region,by="oa11")
dim(tra20111)
#6753307  

dbWriteTable(con, "casatra2011",value = tra2011, append = TRUE, row.names = FALSE)
dbWriteTable(con, "casatra20111",value =tra20111, append = TRUE, row.names = FALSE)

head(tra20111)
head(tra20111)
#caluate the match rate for each local authority in England and Wales

groupoye1<- group_by(tra2011,laua)

summaryoye1 <- summarize(groupoye1,
                         counte= n())


grouptoy1e1<- group_by(tra20111,laua)

summarytoy1e1 <- summarize(grouptoy1e1,
                           countem= n())

sumye1<- merge(summaryoye1,summarytoy1e1,by="laua",all.x=T)
sumye1$ppe <- sumye1$countem/sumye1$counte

write.csv(sumye1,"sumyelaua1.csv",row.names=FALSE)

geo2 <- sqldf("select  distinct laua,lad11nm,RGN11NM from oa_la_region", drv="SQLite", dbname=":memory:" )
##add in the name of each local authority basing on laua
sumye1<-merge(sumye1,geo2,by="laua")

group11<- group_by(tra2011,laua,yearchi)

summary11 <- summarize(group11,
                       counto= n())

group12<- group_by(tra20111,laua,yearchi)

summary12 <- summarize(group12,
                       countl= n())

sum12<- merge(summary11, summary12, by=c("laua","yearchi"))
sum12$p1 <-sum12$countl/sum12$counto
##sum12 is the annual match rate for each local authority in England and Wales after 2010

sum12 <- merge(sum12,geo2,by="laua")
colnames(sum12)<-c("laua","Year","counto","countl","Match rate","la","Region")
write.csv(sum12,"sum12.csv",row.names=FALSE)

cc<-sum12[sum12$`Match rate`<0.8,]
c1<- as.data.frame(unique(cc$laua))
dim(c1)

t23<-sum12[sum12$laua %in% c1$`unique(cc$laua)`, ]
t24<-t23[t23$Year=="2019",]

colnames(t23)[6]<-"Local authority"

#plot the annual match rate for each local authority in England and Wales after 2010
dev.new()


ggplot(data =sum12, aes(x=Year, y=`Match rate`,group=la,))+geom_point(aes(x=Year, y=`Match rate`,group=la),color="grey")+theme_bw()+
  geom_line(aes(x=Year, y=`Match rate`,group=la),color="grey")+
  scale_y_continuous(breaks = c(0.40,0.50,0.60,0.70,0.80,0.90,1.00),labels=scales::percent_format(accuracy = 1))+
  scale_x_continuous(breaks = c(2001,2012,2013,2014,2015,2016,2017,2018,2019))+
  theme(legend.position="bottom",axis.title = element_text(size=14), axis.text = element_text(size=12),strip.text.x = element_text(size=12),strip.text.y = element_text(size=12),legend.text = element_text(size=12),legend.title = element_text(size=13))+
  geom_line(data=t23,aes(x=Year, y=`Match rate`,group=`Local authority`,color=`Local authority`))+geom_point(data=t23,aes(x=Year, y=`Match rate`,group=`Local authority`,color=`Local authority`))

# ----------------------end--------------------------