# Lifecycle analysis of task/comment data for three feature deployments
# (cases c1, c2, c3): label each record with its deployment phase and its week
# offset from deployment, count modal-verb usage in the comment text, and plot
# trends around the deployment date.

library(tidyverse)
library(dplyr)
library(stringr)
library(ggdist)

# Input data ----

c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
c1_input_df <- read.csv(c1_count, header = TRUE)
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
c2_input_df <- read.csv(c2_count, header = TRUE)
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
c3_input_df <- read.csv(c3_count, header = TRUE)

# Helpers ----

# Convert bare epoch seconds to POSIXct (UTC); leave anything else untouched.
# as.POSIXct(<numeric>) without `origin` only works on R >= 4.3, so the origin
# is spelled out to keep difftime() usable on older versions as well.
as_utc_time <- function(x) {
  if (is.numeric(x)) {
    as.POSIXct(x, origin = "1970-01-01", tz = "UTC")
  } else {
    x
  }
}

# Whole weeks between `date` and `ref_date`, floored, so records in the seven
# days before `ref_date` get -1 and records in the seven days from it get 0.
# `date` may be a Date, a POSIXct, or numeric seconds since the Unix epoch.
relative_week <- function(date, ref_date) {
  elapsed_days <- as.numeric(
    difftime(as_utc_time(date), ref_date, units = "days")
  )
  as.integer(elapsed_days %/% 7)
}

# Midnight UTC of a "YYYY-MM-DD" day, as seconds since the Unix epoch.
day_to_epoch <- function(day) {
  as.numeric(as.POSIXct(day, tz = "UTC"))
}

# Label one case's records.
#
# Phase of feature deployment:
#   0 = pre opt-in
#   1 = opt-in beta
#   2 = post-announcement, pre-deployment
#   3 = post-deployment opt-out
#
# df            data frame with a `date_created` column
# source_label  case identifier stored in the `source` column
# opt_in, announcement, deployment
#               "YYYY-MM-DD" key dates; each phase starts at midnight UTC
#
# Returns `df` with `date_created` converted to epoch seconds and with
# `source`, `phase` and `week_index` (weeks relative to deployment) added.
label_case <- function(df, source_label, opt_in, announcement, deployment) {
  opt_in_ts <- day_to_epoch(opt_in)
  announcement_ts <- day_to_epoch(announcement)
  deployment_ts <- day_to_epoch(deployment)

  df |>
    mutate(
      date_created = as.numeric(as.POSIXct(date_created, tz = "UTC")),
      source = source_label,
      # case_when() takes the first matching branch, so each branch only needs
      # its upper bound. A missing date_created matches nothing and yields NA.
      phase = case_when(
        date_created < opt_in_ts ~ 0,
        date_created < announcement_ts ~ 1,
        date_created < deployment_ts ~ 2,
        date_created >= deployment_ts ~ 3
      ),
      week_index = relative_week(date_created, as.Date(deployment))
    )
}

# Number of whole-word occurrences of any of `terms` in each element of `text`
# (case-insensitive). Vectorised over `text`; NA text gives NA.
count_terms <- function(text, terms) {
  pattern <- paste0("\\b", terms, "\\b", collapse = "|")
  str_count(str_to_lower(text), pattern)
}

# Phase labelling ----

# c1 key dates
# opt-in = as.Date("2012-12-11")
# deployment announcement = as.Date("2013-06-06")
# deployment_date <- as.Date("2013-07-01")
c1_input_df <- label_case(
  c1_input_df, "c1",
  opt_in = "2012-12-11",
  announcement = "2013-06-06",
  deployment = "2013-07-01"
)

# c2 key dates
# opt-in = as.Date("2011-10-03")
# deployment announcement = as.Date("2013-08-01")
# deployment_date <- as.Date("2013-08-28")
c2_input_df <- label_case(
  c2_input_df, "c2",
  opt_in = "2011-10-03",
  announcement = "2013-08-01",
  deployment = "2013-08-28"
)

# c3 key dates
# opt-in = as.Date("2013-08-01")
# deployment announcement = as.Date("2015-06-12")
# deployment_date <- as.Date("2015-07-02")
c3_input_df <- label_case(
  c3_input_df, "c3",
  opt_in = "2013-08-01",
  announcement = "2015-06-12",
  deployment = "2015-07-02"
)

# Combine the dataframes into one
combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)

# Text features ----

modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
modal_subset <- c("should", "ought", "must")
whatever_subset <- c("user")

combined_df <- combined_df |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |>
  mutate(
    # Number of task descriptions / subcomments the same author had already
    # written in this case before the current row (current row excluded).
    task_index_prev = cumsum(comment_type == "task_description") -
      (comment_type == "task_description"),
    comment_index_prev = cumsum(comment_type == "task_subcomment") -
      (comment_type == "task_subcomment")
  ) |>
  ungroup() |>
  # The running totals above need the full history, so the window filter has
  # to come after them: keep everything up to 13 weeks past deployment.
  filter(week_index <= 13) |>
  mutate(
    # Occurrence counts. The earlier sum(str_detect(...)) form tested a single
    # collapsed pattern and could therefore only ever produce 0 or 1.
    modal_verb_count = count_terms(comment_text, modal_verbs),
    modal_subset_count = count_terms(comment_text, modal_subset),
    user_count = count_terms(comment_text, whatever_subset)
  )

# Task-level table ----

# One row per task description. TaskPHID_count is the number of rows (the
# description plus its comments) sharing the TaskPHID within the window. The
# term-count columns are inherited from combined_df, so they are not recomputed.
combined_task_df <- combined_df |>
  add_count(TaskPHID, name = "TaskPHID_count") |>
  filter(comment_type == "task_description") |>
  mutate(
    # NOTE(review): assumes date_closed is epoch seconds like date_created --
    # confirm against the CSV schema.
    time_to_close = date_closed - date_created,
    time_to_close_hours = as.numeric(difftime(
      as_utc_time(date_closed), as_utc_time(date_created),
      units = "hours"
    ))
  ) |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |> # order by creation time
  mutate(task_index = row_number()) |> # author's n-th task in this case
  ungroup()

# Plots ----

ggplot(
  combined_task_df,
  aes(x = week_index, y = priority_score, color = source)
) +
  geom_point(alpha = 0.6) + # points, with some transparency
  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
  theme_minimal()

ggplot(
  combined_df,
  aes(
    x = week_index, y = modal_subset_count,
    color = source, linetype = AuthorWMFAffil
  )
) +
  geom_point(alpha = 0.1) + # points, with some transparency
  geom_smooth(method = "loess", se = FALSE) + # LOESS curve, no confidence band
  theme_minimal()

# Tasks closed within 1000 hours; rows with a missing closing time are dropped.
combined_task_df_subset <- combined_task_df |>
  filter(time_to_close_hours < 1000)

ggplot(
  combined_task_df_subset,
  aes(x = TaskPHID_count, y = task_index, color = source)
) +
  geom_smooth(method = "loess", se = TRUE) +
  geom_point(alpha = 0.1) +
  theme_minimal()