###########################################################################################
##### Replication code for                                                            #####
##### "Technocrats, reputation, and responsiveness in policy explanation", Governance #####
#####                                                                                 #####
##### Authors: Michele Scotto di Vettimo and Christel Koop                            #####
##### Publication DOI: https://doi.org/10.1111/gove.70103                             #####
###########################################################################################

# Resolve relative paths against the folder containing this script when it is
# opened in RStudio. Outside RStudio (Rscript, source(), knitr) or for an
# unsaved document there is no usable path, so the working directory is left
# unchanged instead of aborting; in that case run the script from its folder.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
# Start from an empty workspace (also removes `script_path` created above).
rm(list = ls())
library(tidyverse) 
library(openxlsx)
library(stringr) 
library(tibble)
library(purrr)
library(ggpubr)
library(modelsummary)
library(vtable)
library(dotwhisker)
library(dplyr)
library(coefplot)
library(bbmle)
library(zoo)
library(xtable)

# Dispersion estimate of a fitted model object: the weighted sum of squared
# residuals over observations with positive weight, divided by the residual
# degrees of freedom. Reads `weights`, `residuals` and `df.residual` from
# `object` (e.g. the components of a glm fit).
dfun <- function(object) {
  with(object, {
    has_weight <- weights > 0
    weighted_sq <- weights * residuals^2
    sum(weighted_sq[has_weight]) / df.residual
  })
}

# Raise the console print limit so long outputs are not truncated.
options(max.print = 999999)

# Infix "not in" operator: the logical negation of %in%.
`%!in%` <- Negate(`%in%`)

################################################################################
##### read in data 
# Main analysis data; the figures below count its rows as speeches, so it
# appears to hold one row per speech (confirm against the data documentation).
df <- readRDS(file = "analysis_replication_data.rds")

################################################################################
##### Figure 1.
# Frequency of speeches by year and average length in sentences

# left panel
# Theme shared by both Figure 1 panels: black axis lines, enlarged axis text,
# no grid, border or panel background.
fig1_theme <- theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text = element_text(size = 16, hjust = .6),
    axis.title = element_text(size = 18),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  )

# left panel: number of rows (speeches) per year
speeches_per_year <- df %>%
  group_by(year) %>%
  summarise(n = n())

fig_n <- ggplot(speeches_per_year, aes(x = year, y = n)) +
  geom_col(fill = "#56B4E9", color = "black") +
  labs(title = "", fill = "", x = "", y = "Number of speeches") +
  scale_x_continuous(breaks = seq(2000, 2024, 4)) +
  fig1_theme

# right panel: mean of `num_sen` per year
length_per_year <- df %>%
  group_by(year) %>%
  summarise(len = mean(num_sen))

fig_len <- ggplot(length_per_year, aes(x = year, y = len)) +
  geom_col(fill = "#56B4E9", color = "black") +
  labs(title = "", fill = "", x = "", y = "Average speech length (in sentences)") +
  scale_x_continuous(breaks = seq(2000, 2024, 4)) +
  fig1_theme

# Both panels side by side
ggarrange(fig_n, fig_len)

################################################################################
##### Figure 2. Location and event type

# `loc_map` and `loc_bar` are not defined in this script; they are expected to
# be created by figure_2.R (presumably the map and bar-chart panels — confirm
# in that file).
source("figure_2.R")

# Two panels in one row; the second panel is 1.75 times as wide as the first.
ggarrange(loc_map, loc_bar,
          ncol = 2,
          widths = c(1, 1.75))

################################################################################
##### Figure 3.
# Trend of dependent variables over time (dots and loess curve)

# The x aesthetic (`date_delivered`) is treated as a Date: every annotation
# below is positioned with as.Date(). Year numbers such as 2000 or 2024 are
# therefore not valid break positions on this axis, so the four-year ticks are
# expressed as dates and drawn with scale_x_date(), labelled by year.
fig3_year_breaks <- seq(as.Date("2000-01-01"), as.Date("2024-01-01"), by = "4 years")

# Upper panel: reversed Flesch-Kincaid score of each speech over time, with a
# loess smoother. Shaded rectangles and text labels mark the periods labelled
# George, King, Carney and Bailey (alternate periods are shaded).
fig_fk <- ggplot(df, aes(x = date_delivered, y = avg_flekin_rev)) +
  geom_point(color = "#0072B2") +
  geom_smooth(method = "loess", span = 0.25, colour = "black", fill = "#0072B2") +
  labs(title = "Accessible language", fill = "", x = "", y = "Flesch-Kincaid Score (Reversed)") +
  theme_bw() +
  scale_x_date(breaks = fig3_year_breaks, date_labels = "%Y") +
  theme(axis.line = element_line(colour = "black"),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 18),
        plot.title = element_text(size = 16),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  annotate("text", x = as.Date("2000-06-01"), y = -8, label = "George", size = 5) +
  annotate("rect", xmin = as.Date("1997-06-01"), xmax = as.Date("2003-06-30"), ymin = -20, ymax = -7.5, alpha = .1) +
  annotate("text", x = as.Date("2008-06-01"), y = -8, label = "King", size = 5) +
  annotate("text", x = as.Date("2016-12-01"), y = -8, label = "Carney", size = 5) +
  annotate("rect", xmin = as.Date("2013-07-01"), xmax = as.Date("2020-03-15"), ymin = -20, ymax = -7.5, alpha = .1) +
  annotate("text", x = as.Date("2022-06-01"), y = -8, label = "Bailey", size = 5)

# Lower panel: `pct_relatable_sen` over time, same layout as the upper panel
# with the annotations moved to this variable's scale (0 to 75).
fig_pctrelat <- ggplot(df, aes(x = date_delivered, y = pct_relatable_sen)) +
  geom_point(color = "#DD2461") +
  geom_smooth(method = "loess", span = 0.25, colour = "black", fill = "#DD2461") +
  labs(title = "People-centered language", fill = "", x = "", y = "Share of people-centered sentences") +
  theme_bw() +
  scale_x_date(breaks = fig3_year_breaks, date_labels = "%Y") +
  theme(axis.line = element_line(colour = "black"),
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 18),
        plot.title = element_text(size = 16),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  annotate("text", x = as.Date("2000-06-01"), y = 73, label = "George", size = 5) +
  annotate("rect", xmin = as.Date("1997-06-01"), xmax = as.Date("2003-06-30"), ymin = 0, ymax = 75, alpha = .1) +
  annotate("text", x = as.Date("2008-06-01"), y = 73, label = "King", size = 5) +
  annotate("text", x = as.Date("2016-12-01"), y = 73, label = "Carney", size = 5) +
  annotate("rect", xmin = as.Date("2013-07-01"), xmax = as.Date("2020-03-15"), ymin = 0, ymax = 75, alpha = .1) +
  annotate("text", x = as.Date("2022-06-01"), y = 73, label = "Bailey", size = 5)

# Stack the two panels vertically
ggarrange(fig_fk, fig_pctrelat, nrow = 2)
      
################################################################################
##### Figure 4.
# Box-plot of dependent variables by speaker (only speakers with at least 20 speeches)

# Theme shared by both Figure 4 panels.
fig4_theme <- theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 18),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  )

# Left panel: distribution of the reversed Flesch-Kincaid score per speaker.
# `n` is the speaker's number of rows and `y` the speaker's median score;
# speakers with more than 19 rows are kept and ordered by that median.
fig_speaker_access <- df %>%
  group_by(speaker_name) %>%
  mutate(n = n(), y = median(avg_flekin_rev)) %>%
  subset(n > 19) %>%
  ggplot(aes(x = reorder(speaker_name, y), y = avg_flekin_rev)) +
  geom_boxplot(color = "black", fill = "#0072B2") +
  coord_flip() +
  labs(x = "Speaker", y = "Flesch-Kincaid Score (Reversed)") +
  fig4_theme

# Right panel: same construction for `pct_relatable_sen`.
fig_speaker_relat <- df %>%
  group_by(speaker_name) %>%
  mutate(n = n(), y = median(pct_relatable_sen)) %>%
  subset(n > 19) %>%
  ggplot(aes(x = reorder(speaker_name, y), y = pct_relatable_sen)) +
  geom_boxplot(color = "black", fill = "#DD2461") +
  coord_flip() +
  labs(x = "", y = "% of people-centered sentences") +
  fig4_theme

ggarrange(fig_speaker_access, fig_speaker_relat)

################################################################################
##### Analysis

##### Read in media data (they have different subsets of media sentences)
# Named list of media-coverage tables; the element names are printed so the
# available subsets are visible in the console.
media_data <- readRDS("speeches_media_coverage.rds")
names(media_data)

# create datasets with media coverage:
# one merged data set per media subset, joined to the speech data on `text_id`.
# `date_delivered` is dropped from the media table before merging, which avoids
# a duplicated date column in the result.
dataset <- list()
for (m in names(media_data)) {
  coverage <- media_data[[m]]
  keep_cols <- setdiff(names(coverage), "date_delivered")
  dataset[[m]] <- merge(df, coverage[, keep_cols], by = "text_id")
}

################################################################################
### Main Regression models

# Define Main Formulas 

# Accessible-language formulas, built up in three steps:
#   1: media coverage and lagged economic conditions
#   2: + speaker characteristics
#   3: + event type and governorship
# The dependent variable is the *reversed* Flesch-Kincaid score
# (`avg_flekin_rev`), consistent with every other accessibility model and
# figure in this script.
facc1 <- "avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1"
facc2 <- paste(facc1, "role_governor + ext_dum + speech_cumul_log + gender", sep = " + ")
facc3 <- paste(facc2, "event_type_rec + governorship", sep = " + ")

# People-centered-language formulas with the same three steps; the response is
# the pair (relatable, non-relatable) sentence counts for a binomial-type GLM.
fpcl1 <- "cbind(num_relatable_sen, num_non_relatable_sen) ~ negative_coverage + abs_dis_lag1 + unemp_lag1"
fpcl2 <- paste(fpcl1, "role_governor + ext_dum + speech_cumul_log + gender", sep = " + ")
fpcl3 <- paste(fpcl2, "event_type_rec + governorship", sep = " + ")

main_models <- list() # empty list to fill

### Models for accessible language
### DV: (Reversed) Flesch-Kincaid score of speech

main_models[["(1)"]] <- lm(formula = facc1, data = dataset[["performance"]])
main_models[["(2)"]] <- lm(formula = facc2, data = dataset[["performance"]])
main_models[["(3)"]] <- lm(formula = facc3, data = dataset[["performance"]])

### Models for People-centered language
### DV: proportion of relatable sentences in speech

main_models[["(4)"]] <- glm(formula = fpcl1, family = quasibinomial, data = dataset[["performance"]])
main_models[["(5)"]] <- glm(formula = fpcl2, family = quasibinomial, data = dataset[["performance"]])
main_models[["(6)"]] <- glm(formula = fpcl3, family = quasibinomial, data = dataset[["performance"]])

#### Table 1
msummary(main_models, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
##### Split by external vs internal members

models_ext_vs_int <- list()

# Subsamples of the 'performance' data by `ext_dum`. The models below omit
# `ext_dum` (the split variable) and `role_governor` from the predictors.
perf_internal <- dataset[["performance"]] %>% subset(ext_dum == "internal")
perf_external <- dataset[["performance"]] %>% subset(ext_dum == "external")

### Accessible language (linear models on the reversed Flesch-Kincaid score)

models_ext_vs_int[["(1)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + gender + event_type_rec + governorship,
  data = perf_internal
)

models_ext_vs_int[["(2)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + gender + event_type_rec + governorship,
  data = perf_external
)

### People-centered language (quasibinomial GLMs on sentence counts)

models_ext_vs_int[["(3)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + gender + event_type_rec + governorship,
  family = quasibinomial,
  data = perf_internal
)

models_ext_vs_int[["(4)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + gender + event_type_rec + governorship,
  family = quasibinomial,
  data = perf_external
)

### Table C6
msummary(models_ext_vs_int, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
##### Coefficient plot
# Tidy each split model, rescale its estimates with by_2sd() using the model
# frame, drop the event-type and governorship terms, and tag the rows with the
# subsample they come from.
ma1 <- tidy(models_ext_vs_int[[1]]) %>%
  by_2sd(models_ext_vs_int[[1]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Internal members")
ma2 <- tidy(models_ext_vs_int[[2]]) %>%
  by_2sd(models_ext_vs_int[[2]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "External members")
mr1 <- tidy(models_ext_vs_int[[3]]) %>%
  by_2sd(models_ext_vs_int[[3]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Internal members")
mr2 <- tidy(models_ext_vs_int[[4]]) %>%
  by_2sd(models_ext_vs_int[[4]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "External members")

# Theme shared by the two upper panels of Figure C2.
c2_upper_theme <- theme_bw() +
  theme(
    plot.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "bottom"
  )

### Accessibility
coefp1 <- rbind(ma1, ma2) %>%
  mutate(title = "Accessible language") %>%
  dwplot(ci = .9, by_2sd = TRUE, model_order = c("Internal members", "External members")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.1, 1.1)) +
  c2_upper_theme +
  facet_grid(. ~ title) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Relatability (y-axis labels hidden; this panel sits to the right of coefp1)
coefp2 <- rbind(mr1, mr2) %>%
  mutate(title = "People-centered language") %>%
  dwplot(ci = .9, by_2sd = TRUE, model_order = c("Internal members", "External members")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.1, 1.1)) +
  c2_upper_theme +
  theme(axis.text.y = element_blank()) +
  facet_grid(. ~ title) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Figure C2 (upper panel)
ggarrange(coefp1, coefp2)

################################################################################
### Split governor vs others

models_gov_vs_other <- list()

# Subsamples of the 'performance' data by `role_governor`. The models below
# omit `role_governor` (the split variable), `ext_dum` and `gender`.
perf_governor <- dataset[["performance"]] %>% subset(role_governor == "governor")
perf_other <- dataset[["performance"]] %>% subset(role_governor == "other")

### Accessibility
models_gov_vs_other[["(1)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + event_type_rec + governorship,
  data = perf_governor
)

models_gov_vs_other[["(2)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + event_type_rec + governorship,
  data = perf_other
)

### Relatability
models_gov_vs_other[["(3)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + event_type_rec + governorship,
  family = quasibinomial,
  data = perf_governor
)

models_gov_vs_other[["(4)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    speech_cumul_log + event_type_rec + governorship,
  family = quasibinomial,
  data = perf_other
)

### Table C7
msummary(models_gov_vs_other, fmt = fmt_decimal(digits = 2, pdigits = 3))

# Tidy each governor/other model, rescale with by_2sd() using the model frame,
# drop the event-type and governorship terms, and tag rows by subsample.
ma3 <- tidy(models_gov_vs_other[[1]]) %>%
  by_2sd(models_gov_vs_other[[1]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Governor")
ma4 <- tidy(models_gov_vs_other[[2]]) %>%
  by_2sd(models_gov_vs_other[[2]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Other speaker")
mr3 <- tidy(models_gov_vs_other[[3]]) %>%
  by_2sd(models_gov_vs_other[[3]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Governor")
mr4 <- tidy(models_gov_vs_other[[4]]) %>%
  by_2sd(models_gov_vs_other[[4]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "Other speaker")

# Theme shared by the two lower panels of Figure C2.
c2_lower_theme <- theme_bw() +
  theme(
    plot.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "bottom"
  )

### Accessibility
coefp3 <- rbind(ma3, ma4) %>%
  mutate(title = "Accessible language") %>%
  dwplot(ci = .9, by_2sd = TRUE, model_order = c("Governor", "Other speaker")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.1, 1.1)) +
  c2_lower_theme +
  facet_grid(. ~ title) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Relatability (y-axis labels hidden; this panel sits to the right of coefp3)
coefp4 <- rbind(mr3, mr4) %>%
  mutate(title = "People-centered language") %>%
  dwplot(ci = .9, by_2sd = TRUE, model_order = c("Governor", "Other speaker")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.1, 1.1)) +
  c2_lower_theme +
  theme(axis.text.y = element_blank()) +
  facet_grid(. ~ title) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Figure C2 (lower panel)
ggarrange(coefp3, coefp4)

################################################################################
### Pre vs post crisis analysis

models_pre_post_crisis <- list()

# relevel governorship so as to have same reference category for pre- and
# post-crisis years: the second level of `governorship` becomes the reference
# in the new column `governorship_rec`.
dataset[["performance"]]$governorship_rec <-
  relevel(dataset[["performance"]]$governorship, ref = 2)

# Subsamples selected by the `pre_crisis` / `post_crisis` indicator columns.
perf_pre_crisis <- dataset[["performance"]] %>% subset(pre_crisis == 1)
perf_post_crisis <- dataset[["performance"]] %>% subset(post_crisis == 1)

### Accessibility
models_pre_post_crisis[["(1)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship_rec,
  data = perf_pre_crisis
)

models_pre_post_crisis[["(2)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship_rec,
  data = perf_post_crisis
)

### Relatability
models_pre_post_crisis[["(3)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship_rec,
  family = quasibinomial,
  data = perf_pre_crisis
)

models_pre_post_crisis[["(4)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship_rec,
  family = quasibinomial,
  data = perf_post_crisis
)

### Table C4
msummary(models_pre_post_crisis, fmt = fmt_decimal(digits = 2, pdigits = 3))

# Tidy each pre/post-crisis model, rescale with by_2sd() using the model frame,
# drop the event-type and governorship terms, and tag rows with the period.
ma5 <- tidy(models_pre_post_crisis[[1]]) %>%
  by_2sd(models_pre_post_crisis[[1]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "1997-2007")
ma6 <- tidy(models_pre_post_crisis[[2]]) %>%
  by_2sd(models_pre_post_crisis[[2]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "2009-2024")
mr5 <- tidy(models_pre_post_crisis[[3]]) %>%
  by_2sd(models_pre_post_crisis[[3]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "1997-2007")
mr6 <- tidy(models_pre_post_crisis[[4]]) %>%
  by_2sd(models_pre_post_crisis[[4]]$model) %>%
  filter(!grepl("event_type|governorship", term)) %>%
  mutate(model = "2009-2024")

# Theme shared by both Figure 6 panels (larger text than the appendix plots).
fig6_theme <- theme_bw() +
  theme(
    plot.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20),
    legend.title = element_blank(),
    legend.text = element_text(size = 20),
    strip.text = element_text(size = 20),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "bottom"
  )

### Accessibility
coefp5 <- rbind(ma5, ma6) %>%
  mutate(title = "Accessible language") %>%
  dwplot(ci = .9, model_order = c("1997-2007", "2009-2024")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       ext_dumexternal = "External member",
                       role_governorgovernor = "Governor",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.5, 1.5)) +
  fig6_theme +
  facet_grid(. ~ title) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Relatability (y-axis labels hidden; this panel sits to the right of coefp5)
coefp6 <- rbind(mr5, mr6) %>%
  mutate(title = "People-centered language") %>%
  dwplot(ci = .9, model_order = c("1997-2007", "2009-2024")) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage",
                       cpi_yoy_lag1 = "Inflation (lagged)",
                       abs_dis_lag1 = "Dist. infl. target (lagged)",
                       unemp_lag1 = "Unemployment (lagged)",
                       ext_dumexternal = "External member",
                       role_governorgovernor = "Governor",
                       speech_cumul_log = "Cumulative speeches (log)",
                       genderF = "Female speaker")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Standardized coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-1.5, 1.5)) +
  facet_grid(. ~ title) +
  fig6_theme +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = c("#000000", "grey60"),
                     guide = guide_legend(ncol = 2, reverse = TRUE))

### Figure 6
# The left panel is wider because it carries the predictor labels.
ggarrange(coefp5, coefp6, widths = c(5, 2.75))

################################################################################
##### Supplementary material ##### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Speeches by year-event
# Number of rows (speeches) per year, stacked by event type
df %>%
  group_by(year, event_type_rec) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = factor(event_type_rec))) +
  geom_col(position = "stack") +
  labs(title = "", fill = "", x = "Year", y = "Number of speeches") +
  theme_bw()

# Same counts shown as within-year shares
df %>%
  group_by(year, event_type_rec) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = factor(event_type_rec))) +
  geom_col(position = "fill") +
  labs(title = "", fill = "", x = "Year", y = "Share of speeches") +
  theme_bw()

# Number of rows (speeches) per year, stacked by `role_cat`
df %>%
  group_by(year, role_cat) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = factor(role_cat))) +
  geom_col(position = "stack") +
  labs(title = "", fill = "", x = "Year", y = "Number of speeches") +
  theme_bw()

# Same counts shown as within-year shares
df %>%
  group_by(year, role_cat) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = year, y = n, fill = factor(role_cat))) +
  geom_col(position = "fill") +
  labs(title = "", fill = "", x = "Year", y = "Share of speeches") +
  theme_bw()

#### Media coverage measures produced in text analysis code

#### Fine-tuning of the DeBERTa-v3 model

# Hyper-parameter search results: one row per run, keeping the run name, the
# three tuned settings and the evaluation metrics plotted or listed below.
x <- read.csv("boe_rel_lang_deberta-v3-large_hp_search.csv") %>%
  select(
    Name, num_train_epochs, learning_rate, per_device_train_batch_size,
    eval.f1_macro, eval.accuracy, eval.kappa_score, eval.accuracy_balanced,
    eval.precision_macro, eval.recall_macro
  )

# model info: https://huggingface.co/michelescotto/boe_rel_lang_deberta-v3-large

# Theme shared by the four panels of Figure B1.
hp_theme <- theme_bw() +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.position = "right"
  )

# Each panel plots one evaluation metric against the number of training
# epochs; colour encodes the learning rate and point shape the batch size.
fig_f1 <- ggplot(x, aes(x = num_train_epochs, y = eval.f1_macro,
                        color = learning_rate,
                        shape = factor(per_device_train_batch_size))) +
  geom_point(size = 1.5) +
  labs(title = "", color = "Learning rate", x = "Number of training epochs",
       y = "F1 Macro", shape = "Batch size") +
  ylim(0.7, .9) +
  xlim(1, 20) +
  scale_colour_gradient(low = "red", high = "blue") +
  hp_theme

fig_bac <- ggplot(x, aes(x = num_train_epochs, y = eval.accuracy_balanced,
                         color = learning_rate,
                         shape = factor(per_device_train_batch_size))) +
  geom_point(size = 1.5) +
  labs(title = "", color = "Learning rate", x = "Number of training epochs",
       y = "Balanced Accuracy", shape = "Batch size") +
  ylim(0.7, .9) +
  xlim(1, 20) +
  scale_colour_gradient(low = "red", high = "blue") +
  hp_theme

fig_acc <- ggplot(x, aes(x = num_train_epochs, y = eval.accuracy,
                         color = learning_rate,
                         shape = factor(per_device_train_batch_size))) +
  geom_point(size = 1.5) +
  labs(title = "", color = "Learning rate", x = "Number of training epochs",
       y = "Accuracy", shape = "Batch size") +
  ylim(0.8, 1) +
  xlim(1, 20) +
  scale_colour_gradient(low = "red", high = "blue") +
  hp_theme

fig_kappa <- ggplot(x, aes(x = num_train_epochs, y = eval.kappa_score,
                           color = learning_rate,
                           shape = factor(per_device_train_batch_size))) +
  geom_point(size = 1.5) +
  labs(title = "", color = "Learning rate", x = "Number of training epochs",
       y = "Kappa score", shape = "Batch size") +
  ylim(0.55, .75) +
  xlim(1, 20) +
  scale_colour_gradient(low = "red", high = "blue") +
  hp_theme

### Figure B1
# 2 x 2 grid with a single legend shared by all panels
ggarrange(fig_f1, fig_bac, fig_kappa, fig_acc,
          ncol = 2, nrow = 2, common.legend = TRUE)

#####################################
### Section C - Additional Tables ###
#####################################

################################################################################
### descriptive statistics
# Variables entering the summary-statistics table, taken from the
# 'performance' data set. The column order must match the order of `labels`
# passed to st() below (16 variables, 16 labels).
desc <- dataset[["performance"]] %>%
  dplyr::select(
    negative_coverage,
    avg_flekin_rev, avg_dale_chall, avg_fog,
    pct_relatable_sen,
    abs_dis_lag1, unemp_lag1,
    speech_cumul, speech_cumul_log,
    # factor variables
    role_governor, ext_dum, gender,
    event_type_rec,
    governorship,
    pre_crisis,
    post_crisis
  ) %>%
  # Turn the three speaker variables into integer codes minus one, so they are
  # summarised as numeric columns (assumes they are two-level factors, giving
  # 0/1 — TODO confirm the level order matches the labels used below).
  mutate(role_governor = as.integer(role_governor) - 1,
         ext_dum = as.integer(ext_dum) - 1,
         gender = as.integer(gender) - 1)

### Table C1
st(desc, factor.counts = TRUE, out = "latex",
   title = "Summary statistics",
   labels = c("Negative coverage",
              "Flesch-Kincaid Score (Reversed)", "Dale-Chall Score", "Gunning Fog Index", "People centered language",
              "Distance from inflation target", "Unemployment rate",
              "Cumulative n. of speeches", "Cumulative n. of speeches (log)",
              # factor variables
              "Governor", "External MPC member", "Female speaker",
              "Type of event", "Governorship", "Pre-crisis dummy", "Post-crisis dummy"))

################################################################################
### Correlation matrix

# Build a lower-triangular correlation table with significance stars.
#
# x: a numeric matrix, or a data frame coercible to one with as.matrix();
#    columns are the variables to correlate.
#
# Returns a data frame of character cells: row i, column j (j < i) holds the
# correlation rounded to three decimals followed by "***" (p < .01),
# "** " (p < .05), "* " (p < .1) or " ". The diagonal and upper triangle are
# blank, and the last (entirely blank) column is dropped.
#
# Correlations and p-values come from Hmisc::rcorr(). Hmisc is attached on
# first use, as before, but a missing package now stops with a clear message.
corstarsl <- function(x) {
  if (!require(Hmisc)) {
    stop("Package 'Hmisc' is required by corstarsl()", call. = FALSE)
  }
  x <- as.matrix(x)

  # Compute the correlation object once and reuse it for both pieces.
  rc <- rcorr(x)
  R <- rc$r
  p <- rc$P

  ## define notions for significance levels; spacing is important.
  mystars <- ifelse(p < .01, "***", ifelse(p < .05, "** ", ifelse(p < .1, "* ", " ")))

  ## truncate the matrix that holds the correlations to three decimal
  ## (a dummy column of -1.11 is formatted along with R and then removed;
  ## presumably it forces a common width that leaves room for a minus sign)
  R <- format(round(cbind(rep(-1.11, ncol(x)), R), 3))[, -1]

  ## build a new matrix that includes the correlations with their appropriate stars
  Rnew <- matrix(paste0(R, mystars), ncol = ncol(x))
  diag(Rnew) <- paste0(diag(R), " ")
  rownames(Rnew) <- colnames(x)
  colnames(Rnew) <- paste0(colnames(x), "")

  ## remove upper triangle
  Rnew <- as.matrix(Rnew)
  Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
  Rnew <- as.data.frame(Rnew)

  ## remove last column and return the matrix (which is now a data frame)
  ## seq_len() states the intended columns 1..(k - 1) explicitly; the previous
  ## `1:length(Rnew)-1` evaluated to 0:(k - 1) and relied on index 0 being
  ## ignored.
  Rnew[seq_len(ncol(Rnew) - 1)]
}

# Numeric variables entering the correlation matrix.
corr_vars <- c(
  "negative_coverage",
  "avg_flekin_rev", "avg_dale_chall", "avg_fog", "pct_relatable_sen",
  "abs_dis_lag1", "unemp_lag1",
  "speech_cumul", "speech_cumul_log"
)
data_corr <- dataset[["performance"]][, corr_vars]

# Starred correlation table, shown in the console
corr_stars <- corstarsl(data_corr)
corr_stars

# LaTeX version of the same table.
# NOTE(review): `latex.environments` looks like a print.xtable() argument
# rather than an xtable() one — confirm it has the intended effect here.
corrmat <- xtable(corr_stars,
                  caption = "Correlation matrix",
                  label = "tab:corrmat",
                  latex.environments = "center")
### Table C2
print(corrmat)

################################################################################
### Main models with standardised coefficients

### Table C3
# Same six main models, requesting standardised coefficients through
# `standardize = "posthoc"`, with stars at the 10%, 5% and 1% levels.
msummary(
  main_models,
  stars = c("*" = .1, "**" = .05, "***" = .01),
  standardize = "posthoc"
)

#####################################
### Section D - Robustness checks ###
#####################################

################################################################################
### Main Regression models with technical dimension of reputation

technical_models <- list()

# Same three-step specifications as the main models, estimated on the
# 'technical' media-coverage data set instead of 'performance'.
tech_data <- dataset[["technical"]]

### Models for accessible language
### DV: (Reversed) Flesch-Kincaid score of speech

technical_models[["(1)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1,
  data = tech_data
)

technical_models[["(2)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender,
  data = tech_data
)

technical_models[["(3)"]] <- lm(
  avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship,
  data = tech_data
)

### Models for People-centered language
### DV: proportion of relatable sentences in speech

technical_models[["(4)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1,
  family = quasibinomial,
  data = tech_data
)

technical_models[["(5)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender,
  family = quasibinomial,
  data = tech_data
)

technical_models[["(6)"]] <- glm(
  cbind(num_relatable_sen, num_non_relatable_sen) ~
    negative_coverage + abs_dis_lag1 + unemp_lag1 +
    role_governor + ext_dum + speech_cumul_log + gender +
    event_type_rec + governorship,
  family = quasibinomial,
  data = tech_data
)

### Table D4
msummary(technical_models, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### Main models with inflation instead of distance from target

# Robustness check: the main specifications with cpi_yoy_lag1 (inflation)
# replacing abs_dis_lag1 (distance from target). All six models are fitted on
# dataset[['performance']]; (1)-(3) are OLS, (4)-(6) quasi-binomial GLMs.
main_models_inflation <- list()

### Models for accessible language
### DV: (Reversed) Flesch-Kincaid score of speech

# The list name must match the object created above exactly: R is
# case-sensitive, so a capitalised 'Main_models_inflation' would raise
# "object not found" and leave model (1) out of Table D1.
main_models_inflation[['(1)']]<-
          lm(avg_flekin_rev ~ negative_coverage + cpi_yoy_lag1 + unemp_lag1,
             data=dataset[['performance']])

main_models_inflation[['(2)']]<-
          lm(avg_flekin_rev ~ negative_coverage + cpi_yoy_lag1 + unemp_lag1 +
               role_governor + ext_dum + speech_cumul_log + gender,
             data=dataset[['performance']])

main_models_inflation[['(3)']]<-
          lm(avg_flekin_rev ~ negative_coverage + cpi_yoy_lag1 + unemp_lag1 +
               role_governor + ext_dum + speech_cumul_log + gender + event_type_rec + governorship,
             data=dataset[['performance']])

### Models for People-centered language
### DV: proportion of relatable sentences in speech

main_models_inflation[['(4)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + cpi_yoy_lag1 + unemp_lag1,
              family=quasibinomial,data=dataset[['performance']])

main_models_inflation[['(5)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + cpi_yoy_lag1 + unemp_lag1 +
                role_governor + ext_dum + speech_cumul_log + gender,
              family=quasibinomial,data=dataset[['performance']])

main_models_inflation[['(6)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + cpi_yoy_lag1 + unemp_lag1 +
                role_governor + ext_dum + speech_cumul_log + gender + event_type_rec + governorship,
              family=quasibinomial,data=dataset[['performance']])

### Table D1
msummary(main_models_inflation, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### Main models with various ways of capturing time

# Alternative time controls, all fitted on dataset[["performance"]]:
#   (1), (3) add `year`; (2), (4) add `post_crisis`.
#   (1)-(2) are OLS on avg_flekin_rev; (3)-(4) are quasi-binomial GLMs on
#   relatable vs. non-relatable sentence counts.
main_models_time_elems <- list(
  "(1)" = lm(
    avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + year,
    data = dataset[["performance"]]
  ),
  "(2)" = lm(
    avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + post_crisis,
    data = dataset[["performance"]]
  ),
  "(3)" = glm(
    cbind(num_relatable_sen, num_non_relatable_sen) ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + year,
    family = quasibinomial,
    data = dataset[["performance"]]
  ),
  "(4)" = glm(
    cbind(num_relatable_sen, num_non_relatable_sen) ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + post_crisis,
    family = quasibinomial,
    data = dataset[["performance"]]
  )
)

### Table C5
# NOTE(review): labelled C5 although this block sits under the Section D
# header -- confirm the table number against the appendix.
msummary(main_models_time_elems, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### Main models using all media sentences, not just performance

# Main specifications re-estimated on dataset[["all_bank"]]:
#   (1)-(3) OLS with avg_flekin_rev as the dependent variable
#   (4)-(6) quasi-binomial GLM on relatable vs. non-relatable sentence counts
# Element (6) is picked up by position further down for the coefficient plot,
# so the order of the entries matters.
main_models_all_media <- list(
  "(1)" = lm(
    avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1,
    data = dataset[["all_bank"]]
  ),
  "(2)" = lm(
    avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender,
    data = dataset[["all_bank"]]
  ),
  "(3)" = lm(
    avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + governorship,
    data = dataset[["all_bank"]]
  ),
  "(4)" = glm(
    cbind(num_relatable_sen, num_non_relatable_sen) ~
      negative_coverage + abs_dis_lag1 + unemp_lag1,
    family = quasibinomial,
    data = dataset[["all_bank"]]
  ),
  "(5)" = glm(
    cbind(num_relatable_sen, num_non_relatable_sen) ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender,
    family = quasibinomial,
    data = dataset[["all_bank"]]
  ),
  "(6)" = glm(
    cbind(num_relatable_sen, num_non_relatable_sen) ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + governorship,
    family = quasibinomial,
    data = dataset[["all_bank"]]
  )
)

### Table D3
msummary(main_models_all_media, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### Main models, context sentences as well

# Main specifications re-estimated on dataset[['performance_context']].
# (1)-(3) are OLS on avg_flekin_rev; (4)-(6) are quasi-binomial GLMs on
# relatable vs. non-relatable sentence counts.
main_models_context_media <- list() # empty list to fill

main_models_context_media[['(1)']]<-
          lm(avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1,
             data=dataset[['performance_context']])

main_models_context_media[['(2)']]<-
          lm(avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
               role_governor + ext_dum + speech_cumul_log + gender,
             data=dataset[['performance_context']])

main_models_context_media[['(3)']]<-
          lm(avg_flekin_rev ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
               role_governor + ext_dum + speech_cumul_log + gender + event_type_rec + governorship,
             data=dataset[['performance_context']])

main_models_context_media[['(4)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + abs_dis_lag1 + unemp_lag1,
              family=quasibinomial,data=dataset[['performance_context']])

main_models_context_media[['(5)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + abs_dis_lag1 + unemp_lag1 +
                role_governor + ext_dum + speech_cumul_log + gender,
              family=quasibinomial,data=dataset[['performance_context']])

main_models_context_media[['(6)']]<-
          glm(cbind(num_relatable_sen,num_non_relatable_sen) ~ 
                negative_coverage + abs_dis_lag1 + unemp_lag1 +
                role_governor + ext_dum + speech_cumul_log + gender + event_type_rec + governorship,
              family=quasibinomial,data=dataset[['performance_context']])

### Table D2
# Report the context models fitted in this block; printing
# main_models_all_media here would simply reproduce Table D3.
msummary(main_models_context_media, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### Accessibility models with alternative dvs

# OLS accessibility models with alternative dependent variables, all fitted on
# dataset[["performance"]]: the A-series uses avg_dale_chall, the B-series
# uses avg_fog.
# NOTE(review): these models enter log(speech_cumul) directly, whereas the
# other model blocks use the speech_cumul_log column -- confirm the two are
# meant to be the same transformation.
main_models_altern_acc <- list(
  ### DV: Dale-Chall measure
  "(1A)" = lm(
    avg_dale_chall ~ negative_coverage + abs_dis_lag1 + unemp_lag1,
    data = dataset[["performance"]]
  ),
  "(2A)" = lm(
    avg_dale_chall ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + log(speech_cumul) + gender,
    data = dataset[["performance"]]
  ),
  "(3A)" = lm(
    avg_dale_chall ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + log(speech_cumul) + gender +
      event_type_rec + governorship,
    data = dataset[["performance"]]
  ),

  ### DV: Gunning Fog index
  "(1B)" = lm(
    avg_fog ~ negative_coverage + abs_dis_lag1 + unemp_lag1,
    data = dataset[["performance"]]
  ),
  "(2B)" = lm(
    avg_fog ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + log(speech_cumul) + gender,
    data = dataset[["performance"]]
  ),
  "(3B)" = lm(
    avg_fog ~ negative_coverage + abs_dis_lag1 + unemp_lag1 +
      role_governor + ext_dum + log(speech_cumul) + gender +
      event_type_rec + governorship,
    data = dataset[["performance"]]
  )
)

### Table D5
msummary(main_models_altern_acc, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
### OLS and quasi-Poisson models with count of relatable sentences as alternative specification

# Alternative specification for people-centered language: the dependent
# variable is the raw count num_relatable_sen, with num_sen added as a
# regressor. All models are fitted on dataset[["performance"]].
#   (1)-(3) OLS
#   (4)-(6) quasi-Poisson GLM
main_models_pcl_alternative <- list(
  "(1)" = lm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen,
    data = dataset[["performance"]]
  ),
  "(2)" = lm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen +
      role_governor + ext_dum + speech_cumul_log + gender,
    data = dataset[["performance"]]
  ),
  "(3)" = lm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + governorship,
    data = dataset[["performance"]]
  ),
  "(4)" = glm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen,
    family = quasipoisson,
    data = dataset[["performance"]]
  ),
  "(5)" = glm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen +
      role_governor + ext_dum + speech_cumul_log + gender,
    family = quasipoisson,
    data = dataset[["performance"]]
  ),
  "(6)" = glm(
    num_relatable_sen ~
      negative_coverage + abs_dis_lag1 + unemp_lag1 + num_sen +
      role_governor + ext_dum + speech_cumul_log + gender +
      event_type_rec + governorship,
    family = quasipoisson,
    data = dataset[["performance"]]
  )
)

### Table D6
msummary(main_models_pcl_alternative, fmt = fmt_decimal(digits = 2, pdigits = 3))

################################################################################
#### Coefficient plot of negative coverage on different reputational targets

# For each set of models, tidy the sixth specification and keep only the
# term(s) matching 'negative_coverage'; `model` holds the label that
# identifies the source in the plot.
mod_all <- tidy(main_models_all_media[[6]]) %>%
  filter(grepl("negative_coverage", term)) %>%
  mutate(model = "Bank\nin general")

mod_main <- tidy(main_models[[6]]) %>%
  filter(grepl("negative_coverage", term)) %>%
  mutate(model = "Bank &\nPerformance dimension")

mod_tech <- tidy(technical_models[[6]]) %>%
  filter(grepl("negative_coverage", term)) %>%
  mutate(model = "Bank &\nTechnical dimension")

### Figure 5
# Stack the three estimates and draw them as a dot-and-whisker plot with
# ci = .9; `title` exists only to provide the facet strip text.
coefpall <- rbind(mod_main, mod_all, mod_tech) %>%
  mutate(title = "Effect on People-centered language") %>%
  dwplot(
    ci = .9,
    model_order = c(
      "Bank\nin general",
      "Bank &\nPerformance dimension",
      "Bank &\nTechnical dimension"
    )
  ) %>%
  relabel_predictors(c(negative_coverage = "Negative coverage")) +
  geom_vline(xintercept = 0, colour = "red", linetype = 2) +
  xlab("Coefficient") +
  ylab("") +
  scale_x_continuous(limits = c(-.1, .4)) +
  labs(color = "Negative coverage about:") +
  theme_bw(base_size = 4) +
  theme(
    plot.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    axis.text.y = element_blank(),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14),
    strip.text.x = element_text(size = 14),
    legend.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "top"
  ) +
  facet_grid(. ~ title)

print(coefpall)

