library(tidyverse)
library(sandwich)
library(lmtest)
library(olsrr)

setwd('/path/to/data')

focals_agg <- read_csv('focals.csv', na = c('NA', 'NaN'))

focals_agg <- focals_agg[which(!is.na(focals_agg$focal_gender)), ]
focals_agg <- focals_agg[which(!is.na(focals_agg$share_male_citing)), ]

focals_agg <- focals_agg %>% 
  select(focal_id, share_male_citing, focal_gender, FOCAL_PY, FOCAL_TEAMSIZE, avg_eval) %>%
  mutate(focal_age = 2019 - FOCAL_PY) %>%
  select(-FOCAL_PY)

focals_sections <- read_csv('focals_sections.csv') %>%
  distinct()

focals_sections_wide <- focals_sections %>%
  mutate(count = 1) %>%
  spread(key = section_id, value = count, fill = 0)

focals_agg_sections <- focals_agg %>%
  left_join(focals_sections_wide, by = c('focal_id' = 'focal_id'))

focals_agg_sections <- focals_agg_sections %>%
  select(-focal_id)

rm(focals_agg, focals_sections, focals_sections_wide)
gc()

focals_agg_sections$share_male_citing <- 
  100 * focals_agg_sections$share_male_citing

vars_to_exclude <- which(colnames(focals_agg_sections) %in% c('share_male_citing', 'focal_gender', 'citing_prob_n_f', 'citing_prob_n_m'))
focals_agg_sections <- focals_agg_sections[
  !apply(focals_agg_sections[, -vars_to_exclude], MARGIN = 1, FUN = function(x) {any(is.na(x))}), 
]

lm_base <- lm(
  share_male_citing ~ focal_gender,
  data = focals_agg_sections,
  na.action = na.omit
)

lm_sections <- lm(
  share_male_citing ~ . - focal_age - FOCAL_TEAMSIZE - avg_eval,
  data = focals_agg_sections,
  na.action = na.omit
)

lm_full <- lm(
  share_male_citing ~ .,
  data = focals_agg_sections,
  na.action = na.omit
)
  
# robust standard errors

base_robust <- 
  coeftest(lm_base, vcov. = vcovHC(lm_base, type = 'HC0'))
base_robust_rows <- 
  which(rownames(base_robust) %in% c(
    'focal_genderFM', 'focal_genderM'
  ))
m1_name <- paste('M1: no controls')
base_tibble <- 
  tibble(
    Variable = rownames(base_robust[base_robust_rows, ]),
    Coefficient = base_robust[base_robust_rows, 'Estimate'],
    SE = base_robust[base_robust_rows, 'Std. Error'],
    model_name = m1_name
  )

sections_robust <- 
  coeftest(lm_sections, vcov. = vcovHC(lm_sections, type = 'HC0'))
sections_robust_rows <- 
  which(rownames(sections_robust) %in% c(
    'focal_genderFM', 'focal_genderM'
  ))
m2_name <- paste('M2: controlling for keywords')
sections_tibble <- 
  tibble(
    Variable = rownames(sections_robust[sections_robust_rows, ]),
    Coefficient = sections_robust[sections_robust_rows, 'Estimate'],
    SE = sections_robust[sections_robust_rows, 'Std. Error'],
    model_name = m2_name
  )

full_robust <- 
  coeftest(lm_full, vcov. = vcovHC(lm_full, type = 'HC0'))
full_robust_rows <- 
  which(rownames(full_robust) %in% c(
    'focal_genderFM', 'focal_genderM', 'avg_eval', 'focal_age', 'FOCAL_TEAMSIZE'
  ))
m3_name <- paste('M3: full model')
full_tibble <- 
  tibble(
    Variable = rownames(full_robust[full_robust_rows, ]),
    Coefficient = full_robust[full_robust_rows, 'Estimate'],
    SE = full_robust[full_robust_rows, 'Std. Error'],
    model_name = m3_name
  )

f_fct_reordering <- function(var_names) {
  ifelse(var_names[1] == 'Male-authored focal paper', 6,
         ifelse(var_names[1] == 'Mixed-authored focal paper', 5,
                ifelse(var_names[1] == 'Keywords\n(334 binary variables)', 4,
                       ifelse(var_names[1] == 'Average quality rating', 3,
                              ifelse(var_names[1] == 'Age of paper', 2,
                                     ifelse(var_names[1] == 'Number of co-authors', 1
                                     ))))))
}

f_model_reordering <- function(model_names) {
  ifelse(model_names[1] == m1_name, 1, 
         ifelse(model_names[1] == m2_name, 2, 
                ifelse(model_names[1] == m3_name, 3)))
}

coefs_tibble <- base_tibble %>%
  bind_rows(sections_tibble) %>%
  bind_rows(full_tibble) %>%
  mutate(
    Variable = recode(
      Variable, 
      'focal_genderM' = 'Male-authored focal paper',
      'focal_genderFM' = 'Mixed-authored focal paper',
      'FOCAL_TEAMSIZE' = 'Number of co-authors',
      'focal_age' = 'Age of paper',
      'avg_eval' = 'Average quality rating'
    )
  ) %>%
  bind_rows(
    tibble(
      Variable = rep('Keywords\n(334 binary variables)', 2), 
      Coefficient = rep(NA, 2), 
      SE = rep(NA, 2), 
      model_name = c(m2_name, m3_name)
    )
  ) %>%
  mutate(
    Variable = fct_reorder(Variable, Variable, f_fct_reordering),
    model_name = fct_reorder(model_name, model_name, f_model_reordering)
  )

interval95 <- -qnorm((1 - 0.95) / 2)

m1 <- 'M1: no controls'
m2 <- 'M2: incl. keywords'
m3 <- 'M3: full model'
coefs_tibble$model_name2 <- c(m1, m1, m2, m2, m3, m3, m3, m3, m3, m2, m3)

coefs_tibble$Variable2 <- as.character(coefs_tibble$Variable)
coefs_tibble$Variable2[1] <- 'Mixed-authored\nfocal paper'
coefs_tibble$Variable2[3] <- 'Mixed-authored\nfocal paper'
coefs_tibble$Variable2[5] <- 'Mixed-authored\nfocal paper'
coefs_tibble$Variable2[2] <- 'Male-authored\nfocal paper'
coefs_tibble$Variable2[4] <- 'Male-authored\nfocal paper'
coefs_tibble$Variable2[6] <- 'Male-authored\nfocal paper'
coefs_tibble$Variable2 <- factor(coefs_tibble$Variable2)

f_fct_reordering2 <- function(var_names) {
  ifelse(var_names[1] == 'Male-authored\nfocal paper', 6,
         ifelse(var_names[1] == 'Mixed-authored\nfocal paper', 5,
                ifelse(var_names[1] == 'Keywords\n(334 binary variables)', 4,
                       ifelse(var_names[1] == 'Average quality rating', 3,
                              ifelse(var_names[1] == 'Age of paper', 2,
                                     ifelse(var_names[1] == 'Number of co-authors', 1
                                     ))))))
}
coefs_tibble <- coefs_tibble %>%
  mutate(Variable2 = fct_reorder(Variable2, Variable2, f_fct_reordering2))

coef_plot_focals_legend <- ggplot(coefs_tibble, colour = model_name2, aes(x = Variable2, y = Coefficient)) + 
  geom_hline(yintercept = 0, lty = 2, size = 1) +
  geom_point(
    aes(colour = model_name2, shape = model_name2), 
    position = position_dodge(width = -.7),
    size = 4
  ) +
  geom_errorbar(
    aes(ymin = Coefficient - SE * interval95, ymax = Coefficient + SE * interval95, colour = model_name2),
    width = 0, size = 1, position = position_dodge(width = -.7)
  ) +
  scale_color_manual(values = c('#009E73', '#0070C0', '#CC00CC')) +
  coord_flip() + 
  theme_bw() + 
  geom_vline(xintercept = 1.5, colour = 'grey') +
  geom_vline(xintercept = 2.5, colour = 'grey') +
  geom_vline(xintercept = 3.5, colour = 'grey') +
  geom_vline(xintercept = 4.5, colour = 'grey') +
  geom_vline(xintercept = 5.5, colour = 'grey') +
  theme(
    text = element_text(family = 'Helvetica'),
    axis.title.x = element_text(size = 16, colour = 'black'),
    axis.title.y = element_blank(),
    axis.text = element_text(size = 16, colour = 'black'),
    legend.text = element_text(size = 16, colour = 'black'),
    legend.title = element_blank(),
    legend.background = element_blank(),
    legend.box.background = element_rect(size = .2), # size .5 because this is the same size set by theme_bw for plot rect
    legend.position = c(0.65, 0.15),
    legend.key = element_blank(),
    legend.title.align = 0.5,
    axis.ticks = element_line(size = 1),
    axis.ticks.length = unit(-0.3, 'lines'),
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    y = 'Increase in share of male-authored\nciting papers (percentage points)'
  ) +
  guides(
    colour = guide_legend(
      title = '',
      title.position = 'left',
      nrow = 3
    ),
    shape = guide_legend(
      title = '',
      title.position = 'left',
      nrow = 3
    )
  ) +
  annotate(
    'text', x = 4, size = 6,
    y = mean(range(coefs_tibble[, 'Coefficient'], na.rm = TRUE)),
    label = 'Included in M2', 
    colour = '#0070C0',
    vjust = -0.25
  ) + 
  annotate(
    'text', x = 4, size = 6,
    y = mean(range(coefs_tibble[, 'Coefficient'], na.rm = TRUE)),
    label = 'Included in M3', 
    colour = '#CC00CC',
    vjust = 1.25
  )

ggsave('res/coef_plot_focals_legend.png', plot = coef_plot_focals_legend, width = 16, height = 12, units = 'cm')