# Exploratory plots over the cleaned Phabricator exports for three cases
# (c1, c2, c3): time-to-close distributions, task-status proportions,
# keyword counts in task descriptions, and priority vs. time to close.

# Setup ----
library(tidyverse)
library(dplyr)
library(stringr)

# Load data ----
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
c1_input_df <- read.csv(c1_count, header = TRUE)

c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
c2_input_df <- read.csv(c2_count, header = TRUE)

c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
c3_input_df <- read.csv(c3_count, header = TRUE)

# Label each data frame with the case it came from so the rows stay
# distinguishable once they are stacked.
c1_input_df <- c1_input_df |>
  mutate(source = "c1")
c2_input_df <- c2_input_df |>
  mutate(source = "c2")
c3_input_df <- c3_input_df |>
  mutate(source = "c3")

# Combine the data frames into one.
combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)

# Task descriptions and time to close ----
# Keep only the rows holding a task description and derive how long each
# task stayed open.
# NOTE(review): `date_closed - date_created` only works if read.csv() parsed
# both columns as numeric (or they are otherwise subtractable); character
# timestamps would make the subtraction fail -- confirm the column format.
combined_task_df <- combined_df |>
  filter(comment_type == "task_description") |>
  mutate(
    time_to_close = date_closed - date_created,
    time_to_close_hours = as.numeric(
      difftime(date_closed, date_created, units = "hours")
    )
  )

# Half-eye density of time to close per case, one facet per affiliation.
ggplot(
  combined_task_df,
  aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)
) +
  ggdist::stat_halfeye(
    adjust = 0.5,
    width = 1.5, # wider slab
    scale = 8.8, # larger scale for a fatter density
    .width = 0,
    justification = 0,
    point_colour = NA
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
    x = "Source",
    y = "Time to Close (hours)"
  ) +
  theme_minimal()

# Task status proportions ----
# Share of each status within every (AuthorWMFAffil, source) group. The
# result is ungrouped so later verbs do not silently operate per group.
prop_df <- combined_task_df |>
  count(AuthorWMFAffil, source, status) |>
  group_by(AuthorWMFAffil, source) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

# Filled bar plot of the proportions.
ggplot(prop_df, aes(x = source, y = prop, fill = status)) +
  geom_col(position = "fill") +
  facet_wrap(~ AuthorWMFAffil) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proportion of Phabricator Task Status by Affiliation and Case",
    x = "Source",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()

# Keyword counts in task descriptions ----
modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
modal_subset <- c("should", "ought", "must")
whatever_subset <- c("user")

# Build one alternation regex matching any of `words` as a whole word.
build_word_regex <- function(words) {
  paste0("\\b", words, "\\b", collapse = "|")
}

# 1. Count whole-word occurrences of each word list in every comment_text.
# str_count() returns the number of matches per string; the previous
# sum(str_detect(...)) against a single collapsed pattern could only ever
# yield 0 or 1 (presence), not a count. It is also vectorised, so no
# rowwise() pass is needed.
combined_task_df <- combined_task_df |>
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      build_word_regex(modal_verbs)
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      build_word_regex(modal_subset)
    ),
    whatever_subset_count = str_count(
      str_to_lower(comment_text),
      build_word_regex(whatever_subset)
    )
  )

# 2. Violin plot of the per-description count of 'user', with the group
# mean overlaid as a yellow diamond.
ggplot(
  combined_task_df,
  aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)
) +
  geom_violin(
    trim = FALSE,
    position = position_dodge(width = 0.8),
    alpha = 0.6
  ) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    color = "black",
    fill = "yellow",
    position = position_dodge(width = 0.8)
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution and Mean of 'user' by Affiliation and Source",
    x = "Source",
    y = "Count"
  ) +
  theme_minimal()

# Priority vs. time to close ----
# description_length is the character length of each task description; it
# is not used by the plot below.
binned_task_df <- combined_task_df |>
  mutate(description_length = nchar(comment_text))

ggplot(
  binned_task_df,
  aes(x = time_to_close_hours, y = priority_score, color = source)
) +
  geom_point(alpha = 0.6) + # points, with some transparency
  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
  theme_minimal()