From edcb174d42cd378dc25b2b2430bcd9b25a77b5e1 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login01.hyak.local>
Date: Mon, 30 Jun 2025 11:30:27 -0700
Subject: [PATCH] updated preliminary phabricator EDA with things re:
 longitudinal data

---
 p2_EDA/phab_weekly_bins.R | 93 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R
index f6e8292..d059947 100644
--- a/p2_EDA/phab_weekly_bins.R
+++ b/p2_EDA/phab_weekly_bins.R
@@ -9,6 +9,11 @@ c2_input_df <- read.csv(c2_count , header = TRUE)
 c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 
 
+# Whole weeks (floored) from ref_date to date: 0 covers the first 7 days from ref_date, negatives fall before it.
+relative_week <- function(date, ref_date) {
+  if (is.numeric(date)) date <- as.POSIXct(date, origin = "1970-01-01", tz = "UTC")  # epoch secs; bare numerics fail before R 4.3
+  as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
+}
 #phase of feature deployments 
 # pre opt-in (0)
 # opt-in beta (1)
@@ -18,7 +23,7 @@ c3_input_df <- read.csv(c3_count , header = TRUE)
 # opt-in = as.Date("2012-12-11)
 # deployment announcement = as.Date("2013-06-06")
 # deployment_date <- as.Date("2013-07-01")
-
+library(dplyr)
 c1_input_df <- c1_input_df |> 
   mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
   mutate(source = "c1") |>
@@ -27,7 +32,8 @@ c1_input_df <- c1_input_df |>
     date_created >= as.numeric(as.POSIXct("2012-12-11", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) ~ 1,  # opt-in beta
     date_created >=  as.numeric(as.POSIXct("2013-06-06", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
     date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3                             # post-deployment opt-out
-  ))
+  )) |>
+  mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))  # weeks relative to 2013-07-01, the start of phase 3 (post-deployment)
 
 
 # c2 key dates
@@ -43,7 +49,8 @@ c2_input_df <- c2_input_df |>
     date_created >= as.numeric(as.POSIXct("2011-10-03", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) ~ 1,  # opt-in beta
     date_created >=  as.numeric(as.POSIXct("2013-08-01", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
     date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3                             # post-deployment opt-out
-  ))
+  )) |>
+  mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))  # weeks relative to 2013-08-28, the start of phase 3 (post-deployment)
 
 # c3 key dates 
 # opt-in = as.Date("2013-08-01)
@@ -57,18 +64,90 @@ c3_input_df <- c3_input_df %>%
     date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) ~ 1,  # opt-in beta
     date_created >=  as.numeric(as.POSIXct("2015-06-12", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
     date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3                             # post-deployment opt-out
-  ))
+  )) |>
+  mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))  # weeks relative to 2015-07-02, the start of phase 3 (post-deployment)
 
 # Combine the dataframes into one
 combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
 
+# Word lists for the lexical measures below; each entry is matched as a whole, lower-cased word.
+modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
+modal_subset <- c("should", "ought", "must")
+whatever_subset <- c("user")
+
+# Add per-author history counters and per-comment word counts, then cap the window at week 13.
+combined_df <- combined_df |>
+  group_by(AuthorPHID, source) |>
+  arrange(date_created, .by_group = TRUE) |>
+  mutate(
+    # rows of each type by the same author/source that precede the current row
+    task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
+    comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment")
+  ) |>
+  ungroup() |>
+  mutate(
+    # str_count() is vectorised and returns matches per comment; sum(str_detect()) on one string is only 0/1.
+    # stringr:: is explicit because library(stringr) is attached further down the script.
+    modal_verb_count = stringr::str_count(
+      stringr::str_to_lower(comment_text), paste0("\\b", modal_verbs, "\\b", collapse = "|")
+    ),
+    modal_subset_count = stringr::str_count(
+      stringr::str_to_lower(comment_text), paste0("\\b", modal_subset, "\\b", collapse = "|")
+    ),
+    user_count = stringr::str_count(
+      stringr::str_to_lower(comment_text), paste0("\\b", whatever_subset, "\\b", collapse = "|")
+    )
+  ) |>
+  filter(week_index <= 13)
+
+
 combined_task_df <- combined_df %>% 
+  add_count(TaskPHID, name = "TaskPHID_count") |>  # rows sharing this TaskPHID, counted before the task_description filter below
   filter(comment_type == "task_description") |>
   mutate(time_to_close = date_closed - date_created,
          time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
-  ) 
+  ) |>
+  group_by(AuthorPHID, source) %>%
+  arrange(date_created, .by_group = TRUE) %>% # sort by date_created within each author/source so row_number() follows creation order
+  mutate(task_index = row_number()) %>% # 1-based position of the task among that author's tasks in the same source
+  ungroup()
 
-ggplot(combined_task_df, aes(x = priority_score, y = phase, color = source)) +
+ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) + # priority_score against relative week, coloured by source
   geom_point(alpha = 0.6) +                # Points, with some transparency
   geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band
-  theme_minimal()        
\ No newline at end of file
+  theme_minimal()        
+
+library(stringr)
+
+# 1. Count modal-verb, modal-subset and "user" occurrences in each task's comment_text.
+#    str_count() is vectorised, so no rowwise() pass is needed, and it returns the number
+#    of whole-word matches (sum(str_detect(...)) on a single string can only give 0 or 1).
+combined_task_df <- combined_task_df %>%
+  mutate(
+    modal_verb_count = str_count(
+      str_to_lower(comment_text),
+      paste0("\\b", modal_verbs, "\\b", collapse = "|")
+    ),
+    modal_subset_count = str_count(
+      str_to_lower(comment_text),
+      paste0("\\b", modal_subset, "\\b", collapse = "|")
+    ),
+    user_count = str_count(
+      str_to_lower(comment_text),
+      paste0("\\b", whatever_subset, "\\b", collapse = "|")
+    )
+  )
+library(ggdist)
+ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype = AuthorWMFAffil)) +
+  geom_point(alpha = 0.1) + # faint points so the smoothed lines stay readable
+  geom_smooth(method = "loess", se = FALSE) + # LOESS trend lines without a confidence band
+  theme_minimal()
+
+
+
+combined_task_df_subset <- filter(combined_task_df, time_to_close_hours < 1000) # under 1000 hours to close; NA rows drop out
+
+ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) +
+  geom_smooth(method = "loess", se = TRUE) + # LOESS trend with its confidence band
+  geom_point(alpha = 0.1) +
+  theme_minimal()