library(tidyverse) library(sandwich) library(lmtest) library(olsrr) setwd('/path/to/data') focals_agg <- read_csv('focals.csv', na = c('NA', 'NaN')) focals_agg <- focals_agg[which(!is.na(focals_agg$focal_gender)), ] focals_agg <- focals_agg[which(!is.na(focals_agg$share_male_citing)), ] focals_agg <- focals_agg %>% select(focal_id, share_male_citing, focal_gender, FOCAL_PY, FOCAL_TEAMSIZE, avg_eval) %>% mutate(focal_age = 2019 - FOCAL_PY) %>% select(-FOCAL_PY) focals_sections <- read_csv('focals_sections.csv') %>% distinct() focals_sections_wide <- focals_sections %>% mutate(count = 1) %>% spread(key = section_id, value = count, fill = 0) focals_agg_sections <- focals_agg %>% left_join(focals_sections_wide, by = c('focal_id' = 'focal_id')) focals_agg_sections <- focals_agg_sections %>% select(-focal_id) rm(focals_agg, focals_sections, focals_sections_wide) gc() focals_agg_sections$share_male_citing <- 100 * focals_agg_sections$share_male_citing vars_to_exclude <- which(colnames(focals_agg_sections) %in% c('share_male_citing', 'focal_gender', 'citing_prob_n_f', 'citing_prob_n_m')) focals_agg_sections <- focals_agg_sections[ !apply(focals_agg_sections[, -vars_to_exclude], MARGIN = 1, FUN = function(x) {any(is.na(x))}), ] lm_base <- lm( share_male_citing ~ focal_gender, data = focals_agg_sections, na.action = na.omit ) lm_sections <- lm( share_male_citing ~ . - focal_age - FOCAL_TEAMSIZE - avg_eval, data = focals_agg_sections, na.action = na.omit ) lm_full <- lm( share_male_citing ~ ., data = focals_agg_sections, na.action = na.omit ) # robust standard errors base_robust <- coeftest(lm_base, vcov. = vcovHC(lm_base, type = 'HC0')) base_robust_rows <- which(rownames(base_robust) %in% c( 'focal_genderFM', 'focal_genderM' )) m1_name <- paste('M1: no controls') base_tibble <- tibble( Variable = rownames(base_robust[base_robust_rows, ]), Coefficient = base_robust[base_robust_rows, 'Estimate'], SE = base_robust[base_robust_rows, 'Std. Error'], model_name = m1_name ) sections_robust <- coeftest(lm_sections, vcov. = vcovHC(lm_sections, type = 'HC0')) sections_robust_rows <- which(rownames(sections_robust) %in% c( 'focal_genderFM', 'focal_genderM' )) m2_name <- paste('M2: controlling for keywords') sections_tibble <- tibble( Variable = rownames(sections_robust[sections_robust_rows, ]), Coefficient = sections_robust[sections_robust_rows, 'Estimate'], SE = sections_robust[sections_robust_rows, 'Std. Error'], model_name = m2_name ) full_robust <- coeftest(lm_full, vcov. = vcovHC(lm_full, type = 'HC0')) full_robust_rows <- which(rownames(full_robust) %in% c( 'focal_genderFM', 'focal_genderM', 'avg_eval', 'focal_age', 'FOCAL_TEAMSIZE' )) m3_name <- paste('M3: full model') full_tibble <- tibble( Variable = rownames(full_robust[full_robust_rows, ]), Coefficient = full_robust[full_robust_rows, 'Estimate'], SE = full_robust[full_robust_rows, 'Std. Error'], model_name = m3_name ) f_fct_reordering <- function(var_names) { ifelse(var_names[1] == 'Male-authored focal paper', 6, ifelse(var_names[1] == 'Mixed-authored focal paper', 5, ifelse(var_names[1] == 'Keywords\n(334 binary variables)', 4, ifelse(var_names[1] == 'Average quality rating', 3, ifelse(var_names[1] == 'Age of paper', 2, ifelse(var_names[1] == 'Number of co-authors', 1 )))))) } f_model_reordering <- function(model_names) { ifelse(model_names[1] == m1_name, 1, ifelse(model_names[1] == m2_name, 2, ifelse(model_names[1] == m3_name, 3))) } coefs_tibble <- base_tibble %>% bind_rows(sections_tibble) %>% bind_rows(full_tibble) %>% mutate( Variable = recode( Variable, 'focal_genderM' = 'Male-authored focal paper', 'focal_genderFM' = 'Mixed-authored focal paper', 'FOCAL_TEAMSIZE' = 'Number of co-authors', 'focal_age' = 'Age of paper', 'avg_eval' = 'Average quality rating' ) ) %>% bind_rows( tibble( Variable = rep('Keywords\n(334 binary variables)', 2), Coefficient = rep(NA, 2), SE = rep(NA, 2), model_name = c(m2_name, m3_name) ) ) %>% mutate( Variable = fct_reorder(Variable, Variable, f_fct_reordering), model_name = fct_reorder(model_name, model_name, f_model_reordering) ) interval95 <- -qnorm((1 - 0.95) / 2) m1 <- 'M1: no controls' m2 <- 'M2: incl. keywords' m3 <- 'M3: full model' coefs_tibble$model_name2 <- c(m1, m1, m2, m2, m3, m3, m3, m3, m3, m2, m3) coefs_tibble$Variable2 <- as.character(coefs_tibble$Variable) coefs_tibble$Variable2[1] <- 'Mixed-authored\nfocal paper' coefs_tibble$Variable2[3] <- 'Mixed-authored\nfocal paper' coefs_tibble$Variable2[5] <- 'Mixed-authored\nfocal paper' coefs_tibble$Variable2[2] <- 'Male-authored\nfocal paper' coefs_tibble$Variable2[4] <- 'Male-authored\nfocal paper' coefs_tibble$Variable2[6] <- 'Male-authored\nfocal paper' coefs_tibble$Variable2 <- factor(coefs_tibble$Variable2) f_fct_reordering2 <- function(var_names) { ifelse(var_names[1] == 'Male-authored\nfocal paper', 6, ifelse(var_names[1] == 'Mixed-authored\nfocal paper', 5, ifelse(var_names[1] == 'Keywords\n(334 binary variables)', 4, ifelse(var_names[1] == 'Average quality rating', 3, ifelse(var_names[1] == 'Age of paper', 2, ifelse(var_names[1] == 'Number of co-authors', 1 )))))) } coefs_tibble <- coefs_tibble %>% mutate(Variable2 = fct_reorder(Variable2, Variable2, f_fct_reordering2)) coef_plot_focals_legend <- ggplot(coefs_tibble, colour = model_name2, aes(x = Variable2, y = Coefficient)) + geom_hline(yintercept = 0, lty = 2, size = 1) + geom_point( aes(colour = model_name2, shape = model_name2), position = position_dodge(width = -.7), size = 4 ) + geom_errorbar( aes(ymin = Coefficient - SE * interval95, ymax = Coefficient + SE * interval95, colour = model_name2), width = 0, size = 1, position = position_dodge(width = -.7) ) + scale_color_manual(values = c('#009E73', '#0070C0', '#CC00CC')) + coord_flip() + theme_bw() + geom_vline(xintercept = 1.5, colour = 'grey') + geom_vline(xintercept = 2.5, colour = 'grey') + geom_vline(xintercept = 3.5, colour = 'grey') + geom_vline(xintercept = 4.5, colour = 'grey') + geom_vline(xintercept = 5.5, colour = 'grey') + theme( text = element_text(family = 'Helvetica'), axis.title.x = element_text(size = 16, colour = 'black'), axis.title.y = element_blank(), axis.text = element_text(size = 16, colour = 'black'), legend.text = element_text(size = 16, colour = 'black'), legend.title = element_blank(), legend.background = element_blank(), legend.box.background = element_rect(size = .2), # size .5 because this is the same size set by theme_bw for plot rect legend.position = c(0.65, 0.15), legend.key = element_blank(), legend.title.align = 0.5, axis.ticks = element_line(size = 1), axis.ticks.length = unit(-0.3, 'lines'), panel.grid.major.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + labs( y = 'Increase in share of male-authored\nciting papers (percentage points)' ) + guides( colour = guide_legend( title = '', title.position = 'left', nrow = 3 ), shape = guide_legend( title = '', title.position = 'left', nrow = 3 ) ) + annotate( 'text', x = 4, size = 6, y = mean(range(coefs_tibble[, 'Coefficient'], na.rm = TRUE)), label = 'Included in M2', colour = '#0070C0', vjust = -0.25 ) + annotate( 'text', x = 4, size = 6, y = mean(range(coefs_tibble[, 'Coefficient'], na.rm = TRUE)), label = 'Included in M3', colour = '#CC00CC', vjust = 1.25 ) ggsave('res/coef_plot_focals_legend.png', plot = coef_plot_focals_legend, width = 16, height = 12, units = 'cm')