rm(list=ls())
library(lattice)
library(latticeExtra)
library(lme4)
library(Rcapture)
library(metafor)
library(reshape2)
library(MASS)
library(uuid)

load("HITs.RData")

# HITs.RData contains the data.table HITs, with colums:
#   WorkerId --- Each unique WorkerId has been swapped for a UUID
#   SubmitTime --- (Column from original MTurk Batch file)
#   WorkTimeInSeconds --- (Column from original MTurk Batch file)
#   filename --- The name of the MTurk Batch file
#   location.requirement --- Location requirement for the HIT (self report from the experimenter)
#   HIT.requirement --- HIT approval rate requirement (self report from the experimenter)
#   conditional --- Whether the experiment required participation in an earlier study (self report from the experimenter)
#   pay --- in dollars, stripped from the Reward column in the original MTurk Batch file
#   median.duration --- The median WorkTimeInSeconds for each batch
#   lab --- The surname of the experimenter supplying the data
# Number of HITs
nrow(HITs)
# Number of unique workers
length(unique(HITs$WorkerId))
# Number of batches
length(unique(HITs$filename))

# Time range of HITs
(  time.range <- range(HITs$SubmitTime, na.rm=TRUE)  )

HITs <- HITs[,lab.name:=as.factor(lab)]
(  date.plot <- xyplot(factor(lab.name, levels=rev(levels(lab.name))) ~ SubmitTime, data=HITs, type=c("p", "g"), pch=".", jitter.y=TRUE, xlab="Date", ylab="Laboratory", factor=1.7, col="black")  )

HITs <- HITs[is.na(HIT.requirement),HIT.requirement:=0]
(  hit.plot <- histogram(~as.factor(HIT.requirement) | lab.name, data=HITs, layout=c(7,1), scales=list(alternating=FALSE, x=list(rot=90)), xlab="HIT Rate Requirement / %")  )

(  location.plot <- histogram(~factor(location.requirement, levels=c("", "UNITED STATES", "INDIA", "AU, NZ, GB, UNITED STATES"), labels=c("None", "US", "India", "AU/NZ/GB/US")) | lab.name, data=HITs, layout=c(7,1), scales=list(alternating=FALSE, x=list(rot=90)), xlab="Location Requirement")  )

HITs <- HITs[,median.duration:=median(WorkTimeInSeconds),by=filename]
HITs <- HITs[,median.payrate:=pay/median.duration*60*60] # Dollars per hour
median.pay.by.duration <- HITs[,.(median.duration=median(WorkTimeInSeconds), N=.N, pay=median(pay)),by=.(filename,lab.name)]

# Median duration in minutes
median(HITs$WorkTimeInSeconds)/60
# Median pay
median(HITs$pay)
# Median hourly pay
median(with(HITs, pay/WorkTimeInSeconds*60*60))

(  pay.by.duration.plot <- xyplot(pay~median.duration/60 | lab.name, data=median.pay.by.duration, cex=sqrt(median.pay.by.duration$N)/10, scales=list(alternating=FALSE, relation="free", y=list(rot=0)), layout=c(7,1), xlab="Median Duration / Minutes", ylab="Median Pay / $", col="black", type=c("p", "g")) + layer(panel.abline(a=0, b=7.25/60, lty=2))  )

# Figure 1
h <- .2
pdf("lab_details_2.pdf", height=14, width=14*210/297*1.2)
print(date.plot, position=c(0,3*h,1,1))
print(hit.plot, position=c(0,2*h,1,3*h), newpage=FALSE)
print(location.plot, position=c(0,1*h,1,2*h), newpage=FALSE)
print(pay.by.duration.plot, position=c(0,0*h,1,1*h), newpage=FALSE)
dev.off()
# Back up HITs data.table for later sections
HITs.original <- HITs

# Add which year quarter
HITs <- HITs[, quarter:=cut(SubmitTime, "quarter")]

# data.table of workers in each 
workers.per.batch <- HITs[,.(no.workers=length(unique(WorkerId)), no.HITs=.N),by=.(lab,filename)]
workers.per.batch <- workers.per.batch[,HITs.per.worker:=no.HITs/no.workers]

# Batches allowing multiple submissions
multiple.response.filenames <- workers.per.batch[HITs.per.worker>1.1]$filename
HITs <- HITs[, multiple.responses:=ifelse(filename%in%multiple.response.filenames,"Yes", "No")]

#########################################################
# Restrict to open experiments without multiple.responses
#########################################################
HITs <- HITs[conditional=="Open" & multiple.responses=="No"]
nrow(HITs)/nrow(HITs.original)
# The All-Labs estimate

cap.recap.openp <- function(HITs, lab=NA, ...) {
	# Wrapper to run open-population analysis with descriptive(), capture histories, and openp()
	# HITs is a data.frame with one row per capture, with columns for WorkerId and quarter
	capture.histories <- xtabs(~ WorkerId + quarter, data=HITs)
	capture.histories[capture.histories>1] <- 1
	capture.histories <- capture.histories[,colSums(capture.histories)>0] # Delete columns for occasions when no one is caught
	results <- list(periods=colnames(capture.histories))
	results$descriptive=descriptive(capture.histories)
	if(lab=="get.from.HITs.data.table")
        results$lab <- HITs$lab[1]
	else
		results$lab <- lab
	if(ncol(capture.histories)>3) {
		results$openp <- openp(capture.histories, ...)
		#if(!missing(keep))
		#	results$capture.history.freqs <- cbind(histpos.t(ncol(capture.histories)), results$openp$glm$model$Y)
	}
	return(results)
}

# Run the open-population analysis on data from all laboratories
(  op.all <- cap.recap.openp(HITs, lab="All Labs")  )
plot(op.all$openp)
plot(op.all$descriptive)

openp.df <- function(op) {
	# Convert openp() output to data.frame with confidence intervals
	add.CIs <- function(d, type, periods) {
		d <- as.data.frame(d)
		d$type <- type
		d$lower <- with(d, estimate - qnorm(0.975) * stderr)
		d$upper <- with(d, estimate + qnorm(0.975) * stderr)
		if(type %in% c("Survival Probability","New Workers")) {
			d$period <- periods[-1]
			d$period <- as.POSIXct(d$period, "%Y-%m-%d")
			d$period <- d$period + 3600*24*45
		} else {
			d$period <- periods
			d$period <- as.POSIXct(d$period, "%Y-%m-%d")
		}
		d
	}
	capture.probs <- add.CIs(op$openp$capture.prob, "Capture Probability", op$periods)
	survival.probs <- add.CIs(op$openp$survivals, "Survival Probability", op$periods)
	new.arrivals <- add.CIs(op$openp$birth, "New Workers", op$periods)
	abundance <- add.CIs(op$openp$N, "Worker Population", op$periods)

	d <- rbind(capture.probs, survival.probs, new.arrivals, abundance)
	d$lab <- op$lab
	d
}

qs <- as.POSIXct(unique(HITs$quarter), "%Y-%m-%d")
combineLimits(useOuterStrips(segplot(period ~ lower + upper | "All Labs" + type, centers=estimate, data=openp.df(op.all), horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free"), x=list(rot=90), alternating=FALSE), type="b", ylim=list(c(0,1), c(0,12000), c(0,1), c(0,25000)), xlim=time.range) + layer(panel.abline(h=c(seq(0,1,.25),seq(0,25000,2500)), alpha=0.1))) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red")))

# Mean population size estimate over periods
mean(op.all$openp$N[,"estimate"], na.rm=TRUE)
# Separate estimates for each lab

# Use by() to run the open-population analysis separately for each lab
op <- by(HITs, INDICES=list(HITs$lab), FUN=cap.recap.openp, lab="get.from.HITs.data.table")
# Print results for one lab
op$Bartels
op.df <- rbindlist(lapply(op, FUN=openp.df))

combineLimits(useOuterStrips(segplot(period ~ lower + upper | lab+type, centers=estimate, data=op.df, horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free"), x=list(rot=90), alternating=FALSE), ylim=rep(list(c(0,1), c(0,12000), c(0,1), c(0,20000)), each=7), xlim=time.range, type="b")) + layer(panel.abline(h=c(seq(0,1,.2),seq(0,25000,2500)), alpha=0.1)) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red")))
# Meta analysis to estimate for the average lab

# Just do the meta analysis for one estimate
workerpop <- op.df[type=="Worker Population"]
(  ma1 <- rma(y=estimate, sei=stderr, data=workerpop[period=="2013-01-01"])  )

# Now do the meta analysis for all estimates
do.rma <- function(data) {
	if(sum(!is.na(data$estimate))>1) {
		# Only do rma() on data with at least 2 non-NA observations
		ma1 <- rma(yi=estimate, sei=stderr, data=data)
		data.frame(type=data$type[1], period=data$period[1], estimate=ma1$b, se=ma1$se, I2=ma1$I2)
	} else
		NULL
}
# Use by() to run the random-effects meta analysis for each statistic for each period
estimates <- by(data=op.df, INDICES=list(op.df$type, op.df$period), do.rma)
estimates <- rbindlist(estimates)

# Median heterogeneity estimate
median(estimates[type=="Worker Population"]$I2)

# Add 95% CIs
estimates$lower <- with(estimates, estimate - qnorm(0.975) * se)
estimates$upper <- with(estimates, estimate + qnorm(0.975) * se)
estimates$type <- as.character(estimates$type)

useOuterStrips(segplot(period ~ lower + upper | "Average Lab" + type, centers=estimate, data=estimates, horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free"), x=list(rot=90), alternating=FALSE), type="b", ylim=list(c(0,1), c(0,12000), c(0,1), c(0,25000)), xlim=time.range)) + layer(panel.abline(h=c(seq(0,1,.25),seq(0,25000,2500)), alpha=0.1)) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red"))

estimates$lab <- "Average Lab"
# Plot all together for Figure 2
all.combined <- rbind(openp.df(op.all), op.df, estimates, fill=TRUE)

(  all.data.op.plot <- combineLimits(useOuterStrips(segplot(period ~ lower + upper | lab+type, centers=estimate, data=all.combined, horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free", rot=0), x=list(rot=90), alternating=FALSE), ylim=rep(list(c(0,1), c(0,8000), c(0,1), c(0,25000)), each=9), xlim=time.range, type="b")) + layer(panel.abline(h=c(seq(0,1,.25),seq(0,25000,2500)), alpha=0.1)) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red")))  )

pdf("open_population_plot.pdf", width=12, height=8)
all.data.op.plot
dev.off()

# Means over time
# Includes the mean over time of the population estimate for the average lab headlined in the title of the paper
(  est <- estimates[, .(mean.over.time=mean(estimate)), by=type]  )

mean.survival.prob <- est[type=="Survival Probability"]$mean.over.time
# Half life in months
log(0.5)/log(mean.survival.prob)/4*12
histogram(~median.payrate, data=HITs, breaks=0:1000-0.5, xlim=c(0,50))
HITs <- HITs[,pay.rate.quantile:=cut(median.payrate, quantile(HITs$median.payrate))]
op <- by(HITs, INDICES=list(HITs$pay.rate.quantile), FUN=cap.recap.openp, lab="get.from.HITs.data.table")
op.df <- rbindlist(lapply(op, FUN=openp.df))

op.df$lab <- rep(levels(HITs$pay.rate.quantile), each=47)

(  pay.openp <- combineLimits(useOuterStrips(segplot(period ~ lower + upper | lab+type, centers=estimate, data=op.df, horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free"), x=list(rot=90), alternating=FALSE), ylim=rep(list(c(0,1), c(0,7500), c(0,1), c(0,20000)), each=4), xlim=time.range, type="b")) + layer(panel.abline(h=c(seq(0,1,.2),seq(0,25000,2500)), alpha=0.1)) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red")))  )

# Figure 3
pdf("open_population_by_pay.pdf", width=8, height=8)
pay.openp
dev.off()
batch.size <- HITs[,.(batch.size=.N),by=filename]
batch.size.quantiles <- quantile(batch.size$batch.size)

HITs <- merge(HITs, batch.size, by="filename")

HITs <- HITs[,batch.size.quantile:=cut(batch.size, batch.size.quantiles)]
op <- by(HITs, INDICES=list(HITs$batch.size.quantile), FUN=cap.recap.openp, lab="get.from.HITs.data.table")
op.df <- rbindlist(lapply(op, FUN=openp.df))

op.df$lab <- rep(levels(HITs$batch.size.quantile), each=50)
op.df$lab <- factor(op.df$lab, levels=levels(HITs$batch.size.quantile), labels=c("(1,30]", "(30,100]", "(100,200]", "(200,2710]"))
# Figure 4
(  batch.size.openp <- combineLimits(useOuterStrips(segplot(period ~ lower + upper | lab+type, centers=estimate, data=op.df, horizontal=FALSE, xlab="Quarter", ylab="Estimate", scales=list(y=list(relation="free"), x=list(rot=90), alternating=FALSE), ylim=rep(list(c(0,1), c(0,7500), c(0,1), c(0,20000)), each=4), xlim=time.range, type="b")) + layer(panel.abline(h=c(seq(0,1,.2),seq(0,25000,2500)), alpha=0.1)) + layer(panel.abline(v=qs, alpha=0.1)) + layer(panel.abline(h=10000, col="red")))  )

pdf("open_population_by_batch_size.pdf", width=8, height=8)
batch.size.openp
dev.off()

op.df[type=="Worker Population", mean(estimate, na.rm=TRUE), by=lab]
# 95% CIs for averages over time
op.df.av <- op.df[type=="Worker Population", .(estimate=mean(estimate, na.rm=TRUE), stderr=sqrt(sum(stderr^2, na.rm=TRUE))/sum(!is.na(stderr))) ,by=lab]
op.df.av <- op.df.av[,lower.CI:=estimate-qnorm(0.975)*stderr]
op.df.av <- op.df.av[,upper.CI:=estimate+qnorm(0.975)*stderr]
op.df.av
# Keeping only people caught fewer than 10 times
sum(op.all$descriptive$base.freq[,"ui"][10:13])/op.all$descriptive$n # Proportion of workers caught more than 10 times
keep <- apply(histpos.t(13),1,sum)<10
# Run open-population analysis only with workers caught fewer than 10 times
op.all.fewer.than.10 <- cap.recap.openp(HITs, lab="All", keep=keep) 

op.all$openp$N[,"estimate"]
mean(op.all$openp$N[,"estimate"], na.rm=TRUE)
mean(op.all$openp$birth[, "estimate"], na.rm = TRUE)
op.all.fewer.than.10$openp$N[,"estimate"]
mean(op.all.fewer.than.10$openp$N[,"estimate"], na.rm=TRUE)
# US workers with a HIT acceptance rate requirement of greater than 80%
HITs <- HITs.original
# As before, but also only UNITED STATES and high HIT requirements
HITs <- HITs[conditional=="Open" & multiple.responses=="No" & location.requirement=="UNITED STATES" & HIT.requirement>50]
# Fraction remaining compared to original analysis
nrow(HITs)/nrow(HITs.original)

(  op.all <- cap.recap.openp(HITs, lab="All Labs")  )
mean(op.all$openp$N[,"estimate"], na.rm=TRUE)
HITs <- HITs.original

# The distribution of the number of other batches completed within a laboratory

# Add a column to HITs for the number of batches completed by each worker
HITs <- HITs[,N.batches:=.N,by=.(WorkerId)]
# ... and for within each lab
HITs <- HITs[,N.batches.within.lab:=.N,by=.(WorkerId,lab)]

HITs.all.labs <- HITs
HITs.all.labs$lab <- "All labs"
HITs.all.labs <- rbind(HITs, HITs.all.labs)

# Figure 5
(  no.batches.plot <- histogram(~(N.batches.within.lab-1) | lab, breaks=(-1):1000+0.5, xlim=c(-1,20), data=HITs.all.labs, scales=list(alternating=FALSE), as.table=TRUE, layout=c(8,1), xlab="Number of Other Batches Completed", ylab="Proportion of HITs", type="density")  )

pdf("no_batches_plot.pdf", width=12, height=4)
no.batches.plot
dev.off()

round(prop.table(xtabs(~N.batches.within.lab, data=HITs.all.labs[lab=="Bartels"])), digits=2)
round(prop.table(xtabs(~N.batches.within.lab, data=HITs.all.labs[lab=="All labs"])), digits=2)


# The distribution of the number of other laboratories visited

# No number of other labs participated in
workers.by.lab <- HITs[,WorkerId,by=.(WorkerId,lab)]
workers.by.lab <- workers.by.lab[,N:=.N,by=WorkerId]

# Figure 6
(  no.labs.plot <- histogram(~(N-1)|lab, data=workers.by.lab, type="density", breaks=(-1):6+0.5, layout=c(7,1), xlab="Number of Other Labs Visited", ylab="Proportion of Workers", scales=list(alternating=FALSE))  )
pdf("no_labs_plot.pdf", width=12, height=4)
no.labs.plot
dev.off()


# The joint distribution of worker and laboratory capture probabilities, together with marginal distributions
HITs <- HITs.original

lab.by.worker <- xtabs(~WorkerId+lab, data=HITs)
lab.by.worker[lab.by.worker>1] <- 1
freqs <- melt(lab.by.worker)
# Select a random sample of 100 workers for modelling, which means results will vary from the sample in the paper
selected.workers <- sample(unique(freqs$WorkerId), 100)
selected.freqs <- droplevels(subset(freqs, WorkerId%in%selected.workers))

mm1 <- glmer(value~(1|lab)+(1|WorkerId), data=selected.freqs, family=binomial)
summary(mm1)

x <- mvrnorm(1e5, rep(fixef(mm1),2), diag(VarCorr(mm1)))
logit <- function(x) { 1/(1+exp(-x)) }
x <- logit(x)
z <- kde2d(x=x[,1],y=x[,2], h=c(0.2,0.2),n=100)

joint.plot <- contourplot(z$z, row.values=z$x, column.values=z$x, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05), xlab="Worker Capture Probability", ylab="Laboratory Capture Probability")
worker.plot <- densityplot(~x[,1], plot.points=FALSE, xlab="Worker Capture Probability", xlim=c(-0.05,1.05))
lab.plot <- densityplot(~x[,2], plot.points=FALSE, xlab="Laboratory Capture Probability", xlim=c(-0.05,1.05))

# Figure 7
plot(worker.plot, split=c(2,2,2,2))
plot(joint.plot, split=c(2,1,2,2), newpage=FALSE)
plot(lab.plot, split=c(1,1,2,2), newpage=FALSE)

pdf("worker_capture_prob_density.pdf", width=4, height=4)
worker.plot
dev.off()

pdf("lab_capture_prob_density.pdf", width=4, height=4)
lab.plot
dev.off()

pdf("joint_capture_prob_density.pdf", width=4, height=4)
joint.plot
dev.off()

logit(fixef(mm1)[1])
quantile(x[,1], c(0.025, 0.975))
quantile(x[,2], c(0.025, 0.975))