library(tidyverse) library(gridExtra) library(cowplot) library(grid) setwd('/path/to/data') # parameter constellations # note: for some combinations no results will be created ##### fo_data_opts <- c(T, F) # faculty opinion data? sim_types_opts <- c('fo_keywords', 'abstrs', 'citedrefs', 'keywords', 'subj') # fo_keywords needs to be first because of sim_level_info exclude_selfcitations_opts <- c(T, F) citing_papers_opts <- c(T, F) prob_gender_opts <- c(T, F) exclude_extreme_pairs_opts <- c(F, T) # F at first position important! ##### # this function is used to determine filenames based on the options represented by the flags # the function is used for names of files that are saved in this script # force_sim_type_fo_keywords: used for sim_level_info.RData get_filepath <- function(stub, suffix, force_sim_type_fo_keywords = F, res_folder = F, exclude_extreme_pairs = F) { file_path <- stub if (!fo_data && !force_sim_type_fo_keywords) { file_path <- paste('wos_', file_path, sep = '') } if (force_sim_type_fo_keywords || sim_type == 'fo_keywords') { file_path <- paste(file_path, '_fosims', sep = '') } else if (sim_type == 'abstrs') { file_path <- paste(file_path, '_abstrsims', sep = '') } else if (sim_type == 'citedrefs') { file_path <- paste(file_path, '_citedrefsims', sep = '') } else if (sim_type == 'keywords') { file_path <- paste(file_path, '_keywordsims', sep = '') } else if (sim_type == 'subj') { file_path <- paste(file_path, '_subjsims', sep = '') } else { stop('sim_type not set and force_sim_type_fo_keywords == F') } if (exists('exclude_selfcitations') && !exclude_selfcitations) { file_path <- paste(file_path, '_sc', sep = '') } if (exists('citing_papers') && !citing_papers) { file_path <- paste(file_path, '_cited', sep = '') } if (exists('prob_gender') && !prob_gender) { file_path <- paste(file_path, '_cgender', sep = '') } if (exclude_extreme_pairs) { file_path <- paste(file_path, '_exclextrshares', sep = '') } if (res_folder) { file_path <- paste('res/', file_path, sep = '') } if (suffix != '') { file_path <- paste(file_path, '.', suffix, sep = '') } return(file_path) } # initialize tibble for storing mean diffs mean_diffs_tibbles <- list() for (fo_data in fo_data_opts) { for (sim_type in sim_types_opts) { for (exclude_selfcitations in exclude_selfcitations_opts) { for (citing_papers in citing_papers_opts) { for (prob_gender in prob_gender_opts) { if (!fo_data && sim_type != 'abstrs') { next } if (sum(exclude_selfcitations, citing_papers, prob_gender) < 2) { next } if (!fo_data && !citing_papers) { next } if ((!fo_data || sim_type != 'fo_keywords') && sum(exclude_selfcitations, citing_papers, prob_gender) < 3) { next } message('') message('################################################################################') message('') message('read pair data...') # read data file_path <- get_filepath('focal_pairs', 'csv') if (file.exists(file_path)) { focal_pairs <- read_csv( file_path, col_types = cols( .default = 'n', focal_gender_unique = 'c', focal_gender_unique1 = 'c' ), na = c('NA', 'NaN') ) } else { message('file ', file_path, ' does not exist, skip this parameter specification') next } message('done') for (exclude_extreme_pairs in exclude_extreme_pairs_opts) { if (exclude_extreme_pairs && (sim_type != 'abstrs' || !exclude_selfcitations || !prob_gender || !citing_papers)) { next } message('') message('fo_data: ', fo_data) message('sim_type: ', sim_type) message('exclude_selfcitations: ', exclude_selfcitations) message('citing_papers: ', citing_papers) message('prob_gender: ', prob_gender) message('exclude_extreme_pairs: ', exclude_extreme_pairs) message('') # if exclude_extreme_pairs: subset data # (focal_pairs can be overwritten, because in the next iteration another data set is used, # because exclude_extreme_pairs == T is last iteration in inner for loop) if (exclude_extreme_pairs) { focal_pairs <- focal_pairs %>% filter(!extreme_shares) } # order pairs according to sim focal_pairs <- focal_pairs[order(-focal_pairs$sim), ] # create or load sim_level_info.RData # -> used for determining size of sim levels if (sim_type == 'fo_keywords') { sim_level_info <- matrix(ncol = 2, nrow = 6) # used for wos data colnames(sim_level_info) <- c('shares', 'cosine_sims') } else { file_path <- get_filepath('sim_level_info', 'RData', force_sim_type_fo_keywords = T) # sim_level_info is only created for sim_type == 'fo_keywords' --> force this flags for filename load(file_path) } ### save mean diffs in tibble ##### message('save mean diffs...') for (sim_level_fo_keywords in 0:5) { message('level ', sim_level_fo_keywords, '...') if (sim_type == 'abstrs') { no_pairs <- nrow(focal_pairs) * sim_level_info[sim_level_fo_keywords + 1, 1] focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... if (sim_level_fo_keywords == 0) { sim_level <- 0 } else { sim_level <- min(focal_pairs_hist$sim) } } else { sim_level <- sim_level_fo_keywords no_pairs <- length(which(focal_pairs$sim >= sim_level)) focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... if (sim_type == 'fo_keywords') { sim_level_info[sim_level_fo_keywords + 1, 1] <- (no_pairs / nrow(focal_pairs)) sim_level_info[sim_level_fo_keywords + 1, 2] <- focal_pairs$sim[no_pairs] } } # iterate over malecitings, femalecitings for (current_citing_gender in c('citing_m_diff_perc', 'citing_f_diff_perc')) { if (current_citing_gender == 'citing_f_diff_perc') { if (!fo_data || sim_type != 'fo_keywords' || sum(exclude_selfcitations, citing_papers, prob_gender) < 3) { next } } current_n_pairs <- length(which(!is.na(focal_pairs_hist[, current_citing_gender][[1]]))) # calculate statistics (extra subsetting [[1]] is necessary because otherwise R tries to calculate mean of a tibble) mean_emp <- mean(focal_pairs_hist[, current_citing_gender][[1]], na.rm = TRUE) sd_emp <- sd(focal_pairs_hist[, current_citing_gender][[1]], na.rm = TRUE) skewness_emp <- mean(((focal_pairs_hist[, current_citing_gender][[1]] - mean_emp) / sd_emp) ^ 3, na.rm = TRUE) if (current_citing_gender == 'citing_m_diff_perc') { filename_stub <- 'mciting' } else { filename_stub <- 'fciting' } mean_diffs_tibbles[[length(mean_diffs_tibbles) + 1]] <- tibble( pars = get_filepath(filename_stub, suffix = '', exclude_extreme_pairs = exclude_extreme_pairs, res_folder = F), mean_diff = mean_emp, sd_diff = sd_emp, n = current_n_pairs ) } } ##### ### histograms all plots one figure ##### message('create figures with all histograms...') dnorm_scaled0 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp0)) * (1 / dnorm(0, mean = 0, sd = (sd_emp0))) * maxy0) } dnorm_scaled1 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp1)) * (1 / dnorm(0, mean = 0, sd = (sd_emp1))) * maxy1) } dnorm_scaled2 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp2)) * (1 / dnorm(0, mean = 0, sd = (sd_emp2))) * maxy2) } dnorm_scaled3 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp3)) * (1 / dnorm(0, mean = 0, sd = (sd_emp3))) * maxy3) } dnorm_scaled4 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp4)) * (1 / dnorm(0, mean = 0, sd = (sd_emp4))) * maxy4) } dnorm_scaled5 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp5)) * (1 / dnorm(0, mean = 0, sd = (sd_emp5))) * maxy5) } current_title <- '' for (current_citing_gender in c('citing_m_diff_perc', 'citing_f_diff_perc')) { if (current_citing_gender == 'citing_f_diff_perc') { if (!fo_data || sim_type != 'fo_keywords' || sum(exclude_selfcitations, citing_papers, prob_gender) < 3) { next } } for (sim_level_fo_keywords in 0:5) { if (sim_type == 'abstrs') { no_pairs <- nrow(focal_pairs) * sim_level_info[sim_level_fo_keywords + 1, 1] focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... } else { sim_level <- sim_level_fo_keywords no_pairs <- length(which(focal_pairs$sim >= sim_level)) focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... } # calculate statistics (extra subsetting [[1]] is necessary because otherwise R tries to calculate mean of a tibble) assign(paste('mean_emp', sim_level_fo_keywords, sep = ''), mean(focal_pairs_hist[, current_citing_gender][[1]], na.rm = TRUE)) assign(paste('sd_emp', sim_level_fo_keywords, sep = ''), sd(focal_pairs_hist[, current_citing_gender][[1]], na.rm = TRUE)) assign( paste('skewness_emp', sim_level_fo_keywords, sep = ''), mean(((focal_pairs_hist[, current_citing_gender][[1]] - get(paste('mean_emp', sim_level_fo_keywords, sep = ''))) / get(paste('sd_emp', sim_level_fo_keywords, sep = ''))) ^ 3, na.rm = TRUE)) current_hist <- ggplot(focal_pairs_hist, aes_string(x = current_citing_gender)) + geom_histogram(fill = 'darkblue', bins = 40, alpha = .6) + xlim(c(-101, 101)) + geom_vline(xintercept = get(paste('mean_emp', sim_level_fo_keywords, sep = '')), colour = 'red', linetype = 'longdash', size = .5) + geom_vline(xintercept = 0, colour = 'black', linetype = 'solid', size = .5) + theme_classic() + theme( axis.title = element_blank(), axis.text.x = element_text(size = 10, colour = 'black'), axis.text.y = element_blank(), # axis.ticks.length = unit(-0.3, 'lines'), axis.ticks.y = element_blank(), # axis.text = element_blank(), # panel.grid.major.y = element_line(color = 'grey', size = 0.5), plot.margin = unit(c(0, 5, 0, 5), 'mm') ) + labs( title = current_title ) current_hist_built <- ggplot_build(current_hist) assign(paste('maxy', sim_level_fo_keywords, sep = ''), max(current_hist_built$data[[1]]$y)) anno_y <- (get(paste('maxy', sim_level_fo_keywords, sep = '')) - (get(paste('maxy', sim_level_fo_keywords, sep = '')) / 8)) current_hist <- current_hist + stat_function(fun = get(paste('dnorm_scaled', sim_level_fo_keywords, sep = '')), colour = 'black', linetype = 'solid', size = .5) + annotate( 'text', x = 70, y = anno_y, # label = paste('Avg. difference:\n', format(round(get(paste('mean_emp', sim_level_fo_keywords, sep = '')), 2), nsmall = 2)), colour = 'blue3', label = format(round(get(paste('mean_emp', sim_level_fo_keywords, sep = '')), 2), nsmall = 2), colour = 'red', size = 3.5 ) assign(paste('hist', sim_level_fo_keywords, sep = ''), current_hist) } all_hists <- plot_grid(hist0, hist1, hist2, hist3, hist4, hist5, ncol = 3, labels = c('A', 'B', 'C', 'D', 'E', 'F'), label_size = 11) y_grob <- textGrob( 'Frequency', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12), rot = 90 ) if (current_citing_gender == 'citing_m_diff_perc') { x_grob <- textGrob( 'Difference in share of male-authored citing papers (percentage points)', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12, lineheight = 1) ) } else { x_grob <- textGrob( 'Difference in share of female-authored citing papers (percentage points)', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12, lineheight = 1) ) } compound_plot <- grid.arrange(arrangeGrob(all_hists, left = y_grob, bottom = x_grob)) if (current_citing_gender == 'citing_m_diff_perc') { filename_stub <- paste('hists', '_mciting', sep = '') } else { filename_stub <- paste('hists', '_fciting', sep = '') } file_name <- get_filepath(filename_stub, suffix = 'png', exclude_extreme_pairs = exclude_extreme_pairs, res_folder = T) ggsave(file_name, plot = compound_plot, device = 'png', width = 160, height = 100, units = 'mm') } if (sim_type == 'fo_keywords') { file_name <- get_filepath('sim_level_info', 'RData') save(sim_level_info, file = file_name) } ##### ### aggregate over female/male focals ##### if ((sim_type == 'fo_keywords') && exclude_selfcitations && citing_papers && prob_gender && !exclude_extreme_pairs) { message('create histograms aggregated over focals...') dnorm_scaled0 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp0)) * (1 / dnorm(0, mean = 0, sd = (sd_emp0))) * maxy0) } dnorm_scaled1 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp1)) * (1 / dnorm(0, mean = 0, sd = (sd_emp1))) * maxy1) } dnorm_scaled2 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp2)) * (1 / dnorm(0, mean = 0, sd = (sd_emp2))) * maxy2) } dnorm_scaled3 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp3)) * (1 / dnorm(0, mean = 0, sd = (sd_emp3))) * maxy3) } dnorm_scaled4 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp4)) * (1 / dnorm(0, mean = 0, sd = (sd_emp4))) * maxy4) } dnorm_scaled5 <- function(x) { return(dnorm(x, mean = 0, sd = (sd_emp5)) * (1 / dnorm(0, mean = 0, sd = (sd_emp5))) * maxy5) } current_title <- '' # so far only for difference in share of male citing papers, not female citing papers # but aggregated over both female and male focal papers for (current_agg_gender in c('f', 'm')) { for (sim_level_fo_keywords in 0:5) { if (sim_type == 'abstrs') { no_pairs <- nrow(focal_pairs) * sim_level_info[sim_level_fo_keywords + 1, 1] focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... } else { sim_level <- sim_level_fo_keywords no_pairs <- length(which(focal_pairs$sim >= sim_level)) focal_pairs_hist <- focal_pairs[1:no_pairs, ] # focal_pairs is already ordered according to sim -> only get the pairs with highest sim... } if (current_agg_gender == 'f') { if (sim_type == 'abstrs') { pairs_agg_hist <- focal_pairs_hist %>% group_by(paper_f) %>% summarise(avg_diff = mean(citing_m_diff_perc, na.rm = T)) } else { pairs_agg_hist <- focal_pairs_hist %>% group_by(FOCAL_ID) %>% summarise(avg_diff = mean(citing_m_diff_perc, na.rm = T)) } } else { if (sim_type == 'abstrs') { pairs_agg_hist <- focal_pairs_hist %>% group_by(paper_m) %>% summarise(avg_diff = mean(citing_m_diff_perc, na.rm = T)) } else { pairs_agg_hist <- focal_pairs_hist %>% group_by(FOCAL_ID1) %>% summarise(avg_diff = mean(citing_m_diff_perc, na.rm = T)) } } current_n_focals <- length(which(!is.na(pairs_agg_hist$avg_diff))) # calculate statistics assign(paste('mean_emp', sim_level_fo_keywords, sep = ''), mean(pairs_agg_hist$avg_diff, na.rm = TRUE)) assign(paste('sd_emp', sim_level_fo_keywords, sep = ''), sd(pairs_agg_hist$avg_diff, na.rm = TRUE)) assign( paste('skewness_emp', sim_level_fo_keywords, sep = ''), mean(((pairs_agg_hist$avg_diff - get(paste('mean_emp', sim_level_fo_keywords, sep = ''))) / get(paste('sd_emp', sim_level_fo_keywords, sep = ''))) ^ 3, na.rm = TRUE)) current_hist <- ggplot(pairs_agg_hist, aes(x = avg_diff)) + geom_histogram(fill = 'darkblue', bins = 40, alpha = .6) + xlim(c(-101, 101)) + geom_vline(xintercept = get(paste('mean_emp', sim_level_fo_keywords, sep = '')), colour = 'red', linetype = 'longdash', size = .5) + geom_vline(xintercept = 0, colour = 'black', linetype = 'solid', size = .5) + theme_classic() + theme( axis.title = element_blank(), axis.text.x = element_text(size = 10, colour = 'black'), axis.text.y = element_blank(), # axis.ticks.length = unit(-0.3, 'lines'), axis.ticks.y = element_blank(), # axis.text = element_blank(), # panel.grid.major.y = element_line(color = 'grey', size = 0.5), plot.margin = unit(c(0, 5, 0, 5), 'mm') ) + labs( title = current_title ) current_hist_built <- ggplot_build(current_hist) assign(paste('maxy', sim_level_fo_keywords, sep = ''), max(current_hist_built$data[[1]]$y)) current_hist <- current_hist + stat_function(fun = get(paste('dnorm_scaled', sim_level_fo_keywords, sep = '')), colour = 'black', linetype = 'solid', size = .5) + annotate( 'text', x = 70, y = (get(paste('maxy', sim_level_fo_keywords, sep = '')) - (get(paste('maxy', sim_level_fo_keywords, sep = '')) / 8)), # label = paste('Avg. difference:\n', format(round(get(paste('mean_emp', sim_level_fo_keywords, sep = '')), 2), nsmall = 2)), colour = 'blue3', label = format(round(get(paste('mean_emp', sim_level_fo_keywords, sep = '')), 2), nsmall = 2), colour = 'red', size = 3.5 ) # annotate( # 'text', x = -55, y = (get(paste('maxy', sim_level_fo_keywords, sep = '')) - (get(paste('maxy', sim_level_fo_keywords, sep = '')) / 10)), # label = 'Distribution w/o homophily', colour = 'red', size = 4 # ) assign(paste('hist', sim_level_fo_keywords, sep = ''), current_hist) } all_hists <- plot_grid(hist0, hist1, hist2, hist3, hist4, hist5, ncol = 3, labels = c('A', 'B', 'C', 'D', 'E', 'F'), label_size = 11) y_grob <- textGrob( 'Frequency', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12), rot = 90 ) x_grob <- textGrob( 'Average difference in share of male-authored citing papers\nto paired papers (percentage points)', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12, lineheight = 1), ) compound_plot <- grid.arrange(arrangeGrob(all_hists, left = y_grob, bottom = x_grob)) filename_stub <- paste('hists_agg', current_agg_gender, sep = '') file_name <- get_filepath(filename_stub, suffix = 'png', exclude_extreme_pairs = exclude_extreme_pairs, res_folder = T) ggsave(file_name, plot = compound_plot, device = 'png', width = 160, height = 100, units = 'mm') } } ##### ### pair based regression ##### if (fo_data && sim_type == 'fo_keywords' && exclude_selfcitations && citing_papers && prob_gender && !exclude_extreme_pairs) { message('create regression results...') focal_pairs_cat <- focal_pairs %>% mutate( sim0 = (sim == 0), sim1 = (sim == 1), sim2 = (sim == 2), sim3 = (sim == 3), sim4 = (sim == 4), sim5 = (sim >= 5) ) lm_cat <- lm( citing_m_diff_perc ~ avg_eval_diff + sim1 + sim2 + sim3 + sim4 + sim5 + age_diff + teamsize_diff, data = focal_pairs_cat ) sink('res/pairs_regression.txt') print(summary(lm_cat)) sink() predict_data_cat <- tibble( sim1 = c(F, T, F, F, F, F), sim2 = c(F, F, T, F, F, F), sim3 = c(F, F, F, T, F, F), sim4 = c(F, F, F, F, T, F), sim5 = c(F, F, F, F, F, T), avg_eval_diff = rep(0, 6), age_diff = rep(0, 6), teamsize_diff = rep(0, 6) ) predicted_citing_m_diff <- as_tibble( predict.lm(lm_cat, newdata = predict_data_cat, interval = 'confidence') ) predicted_citing_m_diff$sim <- 0:5 plot_predicted_citing_m_diffs <- ggplot(data = predicted_citing_m_diff, aes(x = sim, y = fit)) + geom_errorbar(aes(ymin = lwr, ymax = upr), width = .2, size = .9, colour = 'midnightblue') + geom_line(colour = 'midnightblue', size = .9) + geom_point(colour = 'midnightblue', size = 3.5, shape = 18) + ylim(0, 15) + theme_classic() + theme( panel.grid.major.y = element_line(colour = 'grey60', size = .5), panel.grid.minor.y = element_line(colour = 'grey80', size = .5), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18), axis.text = element_text(size = 14) ) + labs( x = 'Number of shared keywords', y = 'Predicted difference in share of\nmale-authored citing papers' ) ggsave('res/focal_pairs_pred_plot.png', plot = plot_predicted_citing_m_diffs, width = 20, height = 11, units = 'cm') } ##### message('') warnings() message('') message('################################################################################') message('') } } } } } } ### plot only showing the mean differences for each similarity level ##### mean_diffs <- bind_rows(mean_diffs_tibbles) mean_diffs <- mean_diffs %>% group_by(pars) %>% arrange(desc(n)) %>% mutate(sim_level = row_number()) mean_diffs$sim_level <- mean_diffs$sim_level - 1 mean_diffs$sim_level <- as.character(mean_diffs$sim_level) mean_diffs$se_diff <- mean_diffs$sd_diff / sqrt(mean_diffs$n) mean_diffs$ci_half <- qnorm(.975) * mean_diffs$se_diff plot_list <- list() results_to_plot <- c( 'mciting_fosims', 'mciting_abstrsims', 'mciting_citedrefsims', 'mciting_keywordsims', 'mciting_subjsims', 'wos_mciting_abstrsims' ) for(current_result in results_to_plot) { if (current_result == 'mciting_subjsims') { yaxis_upper_lim <- 26 } else { yaxis_upper_lim <- 15 } current_plot <- ggplot(mean_diffs[mean_diffs$pars == current_result, ], aes(y = mean_diff, x = sim_level)) + geom_pointrange( aes(ymin = mean_diff - ci_half, ymax = mean_diff + ci_half), colour = '#0072B2', size = .8, fill = 'white', shape = 18 ) + theme_classic() + theme( axis.title = element_blank(), axis.text.x = element_text(size = 10, colour = 'black'), axis.text.y = element_text(size = 10, colour = 'black'), plot.margin = unit(c(5, 3, 5, 3), 'mm'), legend.position = 'none' ) + ylim(-0.3, yaxis_upper_lim) plot_list[[length(plot_list) + 1]] <- current_plot } all_plots <- plot_grid(plotlist = plot_list, ncol = 3, labels = c('A', 'B', 'C', 'D', 'E', 'F'), vjust = .5, label_size = 11) y_grob <- textGrob( 'Average difference in share of male-authored\nciting papers (percentage points)', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12), rot = 90 ) x_grob <- textGrob( 'Similarity level', gp = gpar(fontfamily = 'Helvetica', fontface = 'bold', col = 'black', fontsize = 12, lineheight = 1), ) top_grob <- textGrob( '' ) compound_plot <- grid.arrange(arrangeGrob(all_plots, left = y_grob, bottom = x_grob, top = top_grob)) ggsave('res/mean_diffs.png', plot = compound_plot, device = 'png', width = 160, height = 120, units = 'mm') #####