# ------------------------------------------------ # Evaluation.R # ------------------------------------------------ # Code provided as is and can be used or modified freely. This code only allows to used for academic purpose. # ------------------------------------------------ # Author: Bin Chi, Adam Dennett # UCL Centre for Advanced Spatial Analysis # bin.chi.16@ucl.ac.uk ; a.dennett@ucl.ac.uk # Date: 20/12/2019 # ------------------------------------------------ ######## function####### # load functions library("dplyr") library("qdap") library(data.table) library(stringr) library("ggplot2") library("sqldf") library("RPostgreSQL") library("stringr") library(tictoc) library(profvis) library("ggplot2") library("entropy") library("FNN") library(ggrepel) #uniquerresult function returns the successful linked research which is one to one match realtionship uniqueresult <- function(x){ dt <- as.data.table(x) esummary<-dt[,.(count=.N),by=transactionid] idd1 <- esummary[esummary$count==1,] result1 <- x[x$transactionid %in% idd1$transactionid,] return(result1) } #doubleresult function returns the successful linked research which is one to many match realtionship doubleresult <- function(x){ dt <- as.data.table(x) esummary<-dt[,.(count=.N),by=transactionid] idd2 <- esummary[esummary$count!=1,] need1 <- x[x$transactionid %in% idd2$transactionid,] return(need1) } #matchleftfunction returns the unmatched the records with 12 specific variables matchleft <- function(x,y){ next0 <- x[!(x$transactionid %in% y$transactionid),c("transactionid","price","dateoftransfer","postcode","propertytype","oldnew","duration","paon","saon","street","locality","towncity")] return(next0) } #matchleft1 function returns the unmatched the records matchleft1 <- function(x,y){ next0 <- x[!(x$transactionid %in% y$transactionid),] } #matchleft function returns the matched the records with 12 specific variables tranneed<- function(x,y){ next0 <- x[(x$transactionid %in% y$transactionid),c("transactionid","price","dateoftransfer","postcode","propertytype","oldnew","duration","paon","saon","street","locality","towncity")] return(next0) } # numextract function extracts first number in the string. numextract <- function(string){ str_extract(string, "\\-*\\d+\\.*\\d*") } ## checkre1 returns the first five records with 17 specific variables Utilizingcheckre1 <- function(ta) {a1<- ta[1:5,c("transactionid","dateoftransfer","postcode.x","saon","paon","street","locality","id","add","add1","add2","add3","propertytype.x","propertytype.y","tfarea","inspectiondate","addressf")] return(a1) } # ------------------------------------------------ ###############read in data from postGIS###### #loads the PostgreSQL driver drv <- dbDriver("PostgreSQL") #creates a connection to the datajournal database con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456) #read the linked data from data linkage process result2 <- dbGetQuery(con,"select * from casaresult2") #read in original Land Registry data tran <- dbGetQuery(con,"select * from pricepaid") #select the linked EPC data tran1 <- tran[tran$transactionid %in% result2$transactionid, ] # ----------------------histogram plot of the linked data and original data after 2009------------------------ #select transactionid, price,yearchi columns from original data. yearchi column is the column which refers the year period for each transaction tran11 <- tran[,c("transactionid","price","yearchi")] dim(tran11) # 24133118 3 #select transactionid, price,yearchi columns from linked EPC data. tran12<- tran1[,c("transactionid","price","yearchi")] dim(tran12) #16846834 3 #caluate the orgianl data volumn for each year groupoy<- group_by(tran11,yearchi) summaryoy <- summarize(groupoy,counto= n()) #caluate the linked data volumn for each year grouptoy1<- group_by(tran12,yearchi) summarytoy1 <- summarize(grouptoy1, count= n()) #merge summaryoy and summarytoy1 and calculate the annual match rate. pp is annual match rate sumyall <- merge(summaryoy,summarytoy1,by="yearchi",all.x=T) sumyall$pp <- sumyall$count/sumyall$counto #save it in the R folder setwd("D:/R") write.csv(sumyall,"sumyintialmatch.csv",row.names=FALSE) ##rename two column in sumyall, prepare for plotting the annual match rate colnames(sumyall)[4]<-"Proportion" colnames(sumyall)[1]<-"Year" ###############plot the match rate between 1995 and 2019############### dev.new() ggplot(data=sumyall, aes(y=Proportion, x=Year))+geom_point( aes(y=Proportion, x=Year),size=3.2)+geom_path(aes(y=Proportion, x=Year),color="darkblue",size=1.6, lineend = "round")+ theme_bw()+ scale_y_continuous(labels=scales::percent,limits = c(0.5, 1))+scale_x_continuous(limits = c(1995,2020),breaks = c(1995,2000,2005,2008,2010,2015,2019))+ theme(axis.title = element_text(size=20), axis.text = element_text(size=18),strip.text = element_text(size=20),legend.text = element_text(size=18),legend.title = element_text(size=20),axis.title.y.right = element_text(margin = margin(t = 0, r = 100, b = 0, l = 100)))+ theme(legend.justification=c(1.05,0), legend.position=c(0.99,0.03))+ylab("Match rate") # ----------------------end-------------------------- # ----------------------histogram plot before and after linkage between 2009 and 2019-------------------------- tran11$P <- "Original data" tran12$P <- "Linked data" tran22 <- tran11[,c("price","yearchi","P")] tran33 <- tran12[,c("price","yearchi","P")] # this takes time to run alldata <- rbind(tran22,tran33) alldata$logprice <- log(alldata$price) #select the data for the period after 2009 alldata<-alldata[alldata$yearchi>2008,] dim(alldata) #16353895 4 data1<-alldata rm(alldata) ###############plot histogram plot before and after linkage between 2009 and 2019 ############### dev.new() # this plot takes time ggplot(data1,aes(x=logprice,group=P,fill=P)) +geom_histogram(bins =100,color="black",position="identity",alpha=.5)+facet_wrap(~ yearchi,ncol = 2)+ xlab("Log of house price")+ theme_bw()+ labs(fill = "Data")+ scale_fill_manual(values = c( "#045a8d","#f7fcfd"))+ theme(legend.position=c(0.80,0.10),axis.title = element_text(size=15), axis.text = element_text(size=12),strip.text = element_text(size=15),legend.text = element_text(size=12),legend.title = element_text(size=13)) rm(data1) # ----------------------end-------------------------- # ----------------------quantify the information lost in the linked data by using K-S test and J-divergence method-------------------------- ###############K-S test############### #select the data after 2008 tran22<-tran22[tran22$yearchi>2008,] tran33<-tran33[tran33$yearchi>2008,] yearui <- unique(tran22$yearchi) c<- 0 pc <- 0 resultts <- data.frame() j <- 1 for(pc in yearui) { print(pc) autodata <- subset(tran22,yearchi==pc) autodata1 <- subset(tran33,yearchi==pc) c <- ks.test(autodata$price,autodata1$price) resultts[j,"year"] <-pc resultts[j,"D"] <- c$statistic resultts[j,"Pvalue"] <-c$p.value resultts[j,"alternative"] <- c$alternative resultts[j,"method"] <- c$method j <- j+1 } #save the K-S test result write.csv(resultts, file = "resultks.csv",row.names=FALSE) ###############J-divergence############### #fuction tabchi3 <- function(A,B) { maxc <- max(log(A$price)) xx <- round(maxc,0)+1 ra <- (xx-0)/100 groupnumber <- c(1:100) m <- 0 n <- 0 j <- 1 result1 <- data.frame() for( j in groupnumber) { print(j) n <- j*ra m <- n-ra print(n) print(m) A$loprice <- log(A$price) B$loprice <- log(B$price) data1<-A[A$loprice >= m & A$loprice= m & B$loprice 2010,] #7249259 17 tran20111<-tran1[tran1$yearchi> 2010,] #6753335 17 dim(tran20111)[1]/dim(tran2011)[1] #6753335/7249259=0.9315897 dbWriteTable(con, "casatran2011",value = tran2011, append = TRUE, row.names = FALSE) dbWriteTable(con, "casatran20111",value = tran20111, append = TRUE, row.names = FALSE) ###############calculate the match rate for different property type############### grouptypeo<- group_by(tran2011,propertytype) summarytypeo <- summarize(grouptypeo, counto= n()) grouptype1<- group_by(tran20111,propertytype) summarytype1 <- summarize(grouptype1, count1= n()) head(summarytypeo) head(summarytype1) sum <- merge(summarytypeo,summarytype1,by="propertytype") sum$pro <- sum$count1/sum$counto write.csv(sum,"sumproprtytype.csv",row.names=FALSE) rm(sum) ############### evaluate the spatail match rate ############### con <- dbConnect(drv, dbname = "datajournal",port=5432, user="postgres",password=123456) nspl2019<- dbGetQuery(con,"select * from nspl2019") nspl20191<-nspl2019[,c("pcds","oa11","lsoa11","msoa11")] colnames(nspl20191)[1]<-"postcode" dim(tran2011) #7249259 tra2011<-merge(tran2011,nspl20191,by="postcode") dim(tra2011) #7246258 #7246258. lost 3001 dim(tran20111) #6753335 tra20111 <- merge(tran20111,nspl20191,by="postcode") dim(tran20111)[1]-dim(tra20111)[1] #6753307 28 lost #read in the Output Area (2011) to Built-up Area Sub-division to Built-up Area to Local Authority District to Region (December 2011) Lookup in England and Wales table #http://geoportal.statistics.gov.uk/datasets/output-area-2011-to-built-up-area-sub-division-to-built-up-area-to-local-authority-district-to-region-december-2011-lookup-in-england-and-wales oa_la_region<-read.csv("Output_Area_2011_to_Builtup_Area_Subdivision_to_Builtup_Area_to_Local_Authority_District_to_Region_December_2011_Lookup_in_England_and_Wales.csv") colnames(oa_la_region)<-c("oa11","BUASD11CD","BUASD11NM","BUA11CD","BUA11NM","laua","lad11nm","LAD11NMW ","gor","RGN11NM","RGN11NMW","ObjectId") oa_la_region<-oa_la_region[,c("oa11","laua","lad11nm","gor","RGN11NM")] tra2011<-merge(tra2011, oa_la_region,by="oa11" ) dim(tra2011) #7246256 tra20111<-merge(tra20111,oa_la_region,by="oa11") dim(tra20111) #6753307 dbWriteTable(con, "casatra2011",value = tra2011, append = TRUE, row.names = FALSE) dbWriteTable(con, "casatra20111",value =tra20111, append = TRUE, row.names = FALSE) head(tra20111) head(tra20111) #caluate the match rate for each local authority in England and Wales groupoye1<- group_by(tra2011,laua) summaryoye1 <- summarize(groupoye1, counte= n()) grouptoy1e1<- group_by(tra20111,laua) summarytoy1e1 <- summarize(grouptoy1e1, countem= n()) sumye1<- merge(summaryoye1,summarytoy1e1,by="laua",all.x=T) sumye1$ppe <- sumye1$countem/sumye1$counte write.csv(sumye1,"sumyelaua1.csv",row.names=FALSE) geo2 <- sqldf("select distinct laua,lad11nm,RGN11NM from oa_la_region", drv="SQLite", dbname=":memory:" ) ##add in the name of each local authority basing on laua sumye1<-merge(sumye1,geo2,by="laua") group11<- group_by(tra2011,laua,yearchi) summary11 <- summarize(group11, counto= n()) group12<- group_by(tra20111,laua,yearchi) summary12 <- summarize(group12, countl= n()) sum12<- merge(summary11, summary12, by=c("laua","yearchi")) sum12$p1 <-sum12$countl/sum12$counto ##sum12 is the annual match rate for each local authority in England and Wales after 2010 sum12 <- merge(sum12,geo2,by="laua") colnames(sum12)<-c("laua","Year","counto","countl","Match rate","la","Region") write.csv(sum12,"sum12.csv",row.names=FALSE) cc<-sum12[sum12$`Match rate`<0.8,] c1<- as.data.frame(unique(cc$laua)) dim(c1) t23<-sum12[sum12$laua %in% c1$`unique(cc$laua)`, ] t24<-t23[t23$Year=="2019",] colnames(t23)[6]<-"Local authority" #plot the annual match rate for each local authority in England and Wales after 2010 dev.new() ggplot(data =sum12, aes(x=Year, y=`Match rate`,group=la,))+geom_point(aes(x=Year, y=`Match rate`,group=la),color="grey")+theme_bw()+ geom_line(aes(x=Year, y=`Match rate`,group=la),color="grey")+ scale_y_continuous(breaks = c(0.40,0.50,0.60,0.70,0.80,0.90,1.00),labels=scales::percent_format(accuracy = 1))+ scale_x_continuous(breaks = c(2001,2012,2013,2014,2015,2016,2017,2018,2019))+ theme(legend.position="bottom",axis.title = element_text(size=14), axis.text = element_text(size=12),strip.text.x = element_text(size=12),strip.text.y = element_text(size=12),legend.text = element_text(size=12),legend.title = element_text(size=13))+ geom_line(data=t23,aes(x=Year, y=`Match rate`,group=`Local authority`,color=`Local authority`))+geom_point(data=t23,aes(x=Year, y=`Match rate`,group=`Local authority`,color=`Local authority`)) # ----------------------end--------------------------