From 2af7983fdbd7952a94076777a268de6cc999dc39 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login03.hyak.local>
Date: Fri, 27 Jun 2025 14:01:23 -0700
Subject: [PATCH] cleaned data and updated with some preliminary panel
 groupings, more longitudinal EDA needed

---
 mgaughan-rstudio-server_27074957.out | 18 ------
 062325_EDA.R => p2_EDA/062325_EDA.R  | 25 ++++++--
 p2_EDA/clean_c2c3_phab.R             | 96 ++++++++++++++++++++++++++++
 p2_EDA/phab_weekly_bins.R            | 74 +++++++++++++++++++++
 4 files changed, 189 insertions(+), 24 deletions(-)
 delete mode 100644 mgaughan-rstudio-server_27074957.out
 rename 062325_EDA.R => p2_EDA/062325_EDA.R (78%)
 create mode 100644 p2_EDA/clean_c2c3_phab.R
 create mode 100644 p2_EDA/phab_weekly_bins.R

diff --git a/mgaughan-rstudio-server_27074957.out b/mgaughan-rstudio-server_27074957.out
deleted file mode 100644
index d528f93..0000000
--- a/mgaughan-rstudio-server_27074957.out
+++ /dev/null
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: xR04Y8VD4WRBYcJKI7NH
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 27074957
-slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
diff --git a/062325_EDA.R b/p2_EDA/062325_EDA.R
similarity index 78%
rename from 062325_EDA.R
rename to p2_EDA/062325_EDA.R
index e8e8dde..c85a58d 100644
--- a/062325_EDA.R
+++ b/p2_EDA/062325_EDA.R
@@ -1,12 +1,12 @@
 library(tidyverse)
 
-c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE) 
 
-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE) 
 
-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 
 
 library(dplyr)
@@ -67,6 +67,7 @@ library(stringr)
 # modal verbs 
 modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
 modal_subset <- c('should', 'ought', 'must')
+whatever_subset <- c('user')
 # 1. Count modal verbs in each comment_text
 combined_task_df <- combined_task_df %>%
   rowwise() %>%
@@ -78,12 +79,16 @@ combined_task_df <- combined_task_df %>%
     modal_subset_count = sum(str_detect(
       str_to_lower(comment_text),
       paste0("\\b", modal_subset, "\\b", collapse = "|")
+    )),
+    whatever_subset_count = sum(str_detect( # NOTE(review): under rowwise() str_detect() gives one TRUE/FALSE per row, so this is 0 or 1 (presence), not a number of occurrences -- confirm intent
+      str_to_lower(comment_text),
+      paste0("\\b", whatever_subset, "\\b", collapse = "|") # word-bounded alternation of the terms in whatever_subset
     ))
   ) %>%
   ungroup()
 
 # 3. Plot (e.g., bar plot of mean modal verbs per group)
-ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)) +
+ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) +
   geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
   stat_summary(
     fun = mean,
@@ -96,8 +101,16 @@ ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWM
   ) +
   facet_wrap(~ AuthorWMFAffil) +
   labs(
-    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
+    title = "Distribution and Mean of 'user' by Affiliation and Source",
     x = "Source",
-    y = "Modal Verb Count"
+    y = "Count"
   ) +
   theme_minimal()
+
+binned_task_df <- combined_task_df |>
+  mutate(description_length = nchar(comment_text)) # character count of comment_text; not used by the plot below
+# Scatter of priority_score against time_to_close_hours, coloured by source.
+ggplot(binned_task_df, aes(x = time_to_close_hours, y = priority_score, color = source)) +
+  geom_point(alpha = 0.6) +                # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with its confidence band (se = TRUE)
+  theme_minimal()
\ No newline at end of file
diff --git a/p2_EDA/clean_c2c3_phab.R b/p2_EDA/clean_c2c3_phab.R
new file mode 100644
index 0000000..43e2518
--- /dev/null
+++ b/p2_EDA/clean_c2c3_phab.R
@@ -0,0 +1,96 @@
+library(tidyverse)
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_input_df <- read.csv(c2_count , header = TRUE) 
+
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_input_df <- read.csv(c3_count , header = TRUE) 
+
+# Relevance filter for task text (description or title), using an expanded
+# keyword dictionary: http, login, SSL, TLS, certificate.
+#
+# text: a single string, or NA.
+#
+# Returns TRUE when any whitespace-separated token, lower-cased, contains
+# "http", "login", "ssl" or "tls", or starts with "cert" but not "certain".
+# Tokens containing "://" are never considered. NA input returns FALSE.
+http_relevant <- function(text) {
+  if (is.na(text)) {
+    return(FALSE)
+  }
+  # substrings that make a token relevant wherever they occur in it
+  substring_terms <- c("http", "login", "ssl", "tls")
+  tokens <- unlist(strsplit(text, "\\s+"))
+  for (token in tokens) {
+    lowered <- tolower(token)
+    # tokens with a URL scheme separator are skipped entirely
+    if (grepl("://", lowered, fixed = TRUE)) {
+      next
+    }
+    has_term <- FALSE
+    for (term in substring_terms) {
+      has_term <- has_term || grepl(term, lowered, fixed = TRUE)
+    }
+    # "cert", "certs", "certificate", ... but not "certain", "certainly", ...
+    is_cert <- startsWith(lowered, "cert") && !startsWith(lowered, "certain")
+    if (has_term || is_cert) {
+      return(TRUE)
+    }
+  }
+  # no token matched
+  FALSE
+}
+
+library(dplyr)
+
+
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_input_df <- read.csv(c1_count , header = TRUE) 
+# C1: after 11-11-2012 before 10-1-2013
+# (1352592000 = 2012-11-11 00:00:00 UTC, 1380671999 = 2013-10-01 23:59:59 UTC)
+c1_desc_flags <- c1_input_df |>
+  filter(comment_type == "task_description") |>
+  mutate(time_flag = 1352592000 <= date_created & date_created <= 1380671999) |>
+  select(TaskPHID, time_flag)
+# every row of a task inherits the flag computed on its task_description row
+c1_flagged <- left_join(c1_input_df, c1_desc_flags, by = "TaskPHID")
+# rows whose flag is FALSE or missing are dropped
+c1_sampled <- filter(c1_flagged, time_flag)
+# count of task_description rows that survive the filter
+sum(c1_sampled$comment_type == "task_description")
+
+
+write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv", row.names=FALSE)
+
+# going off of the dateCreated of the Task
+# C2: after 9-3-2011 before 11-27-2013; a task is kept when its title or description passes http_relevant()
+c2_desc_flags <- c2_input_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |> # 2011-09-03 00:00:00 to 2013-11-27 23:59:59 UTC
+  select(TaskPHID, http_flag, time_flag)
+# every row of a task inherits the flags computed on its task_description row
+c2_flagged <- c2_input_df %>%
+  left_join(c2_desc_flags, by = "TaskPHID")
+# rows whose flags are FALSE or missing are dropped
+c2_sampled <- c2_flagged |>
+  filter(http_flag == TRUE) |>
+  filter(time_flag == TRUE)
+sum(c2_sampled$comment_type == "task_description")
+# output name must be the 062725_c2_cleaned_phab.csv path that the EDA scripts read back in
+write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv", row.names=FALSE)
+# C3: after 07-01-2013 before 10-01-2015; a task is kept when its title or description passes http_relevant()
+c3_desc_flags <- c3_input_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |> # 2013-07-01 00:00:00 to 2015-10-01 23:59:59 UTC
+  select(TaskPHID, http_flag, time_flag)
+# every row of a task inherits the flags computed on its task_description row
+c3_flagged <- c3_input_df %>%
+  left_join(c3_desc_flags, by = "TaskPHID")
+# rows whose flags are FALSE or missing are dropped
+c3_sampled <- c3_flagged |>
+  filter(http_flag == TRUE) |>
+  filter(time_flag == TRUE)
+sum(c3_sampled$comment_type == "task_description")
+# output name must be the 062725_c3_cleaned_phab.csv path that the EDA scripts read back in
+write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv", row.names=FALSE)
diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R
new file mode 100644
index 0000000..f6e8292
--- /dev/null
+++ b/p2_EDA/phab_weekly_bins.R
@@ -0,0 +1,74 @@
+library(tidyverse)
+
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
+c1_input_df <- read.csv(c1_count , header = TRUE) 
+
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_input_df <- read.csv(c2_count , header = TRUE) 
+
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_input_df <- read.csv(c3_count , header = TRUE) 
+
+# Phase of feature deployment, coded from each row's creation time:
+#   0 = pre opt-in
+#   1 = opt-in beta
+#   2 = post-announcement pre-deployment
+#   3 = post-deployment opt-out
+#
+# Key dates (each phase starts at 00:00:00 UTC on its date):
+#   case  opt-in      announcement  deployment
+#   c1    2012-12-11  2013-06-06    2013-07-01
+#   c2    2011-10-03  2013-08-01    2013-08-28
+#   c3    2013-08-01  2015-06-12    2015-07-02
+
+# Convert a "YYYY-MM-DD" string to epoch seconds at midnight UTC.
+utc_epoch <- function(date_string) {
+  as.numeric(as.POSIXct(date_string, tz = "UTC"))
+}
+
+# Add `source` and `phase` columns to one case's data frame.
+#   df            data frame with a date_created column
+#   source_label  value stored in the source column
+#   opt_in, announcement, deployment
+#                 "YYYY-MM-DD" dates on which phases 1, 2 and 3 begin
+# Returns df with date_created as numeric epoch seconds plus the new columns.
+add_phase <- function(df, source_label, opt_in, announcement, deployment) {
+  opt_in_ts <- utc_epoch(opt_in)
+  announcement_ts <- utc_epoch(announcement)
+  deployment_ts <- utc_epoch(deployment)
+  df |>
+    mutate(
+      # explicit origin: as.POSIXct() on numeric input requires it before R 4.3
+      date_created = as.numeric(
+        as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")
+      ),
+      source = source_label,
+      # branches are tried in order, so each one only needs its upper bound;
+      # a missing date_created matches no branch and yields NA
+      phase = case_when(
+        date_created < opt_in_ts ~ 0,
+        date_created < announcement_ts ~ 1,
+        date_created < deployment_ts ~ 2,
+        date_created >= deployment_ts ~ 3
+      )
+    )
+}
+
+# Label each case with its own key dates.
+c1_input_df <- add_phase(c1_input_df, "c1", "2012-12-11", "2013-06-06", "2013-07-01")
+c2_input_df <- add_phase(c2_input_df, "c2", "2011-10-03", "2013-08-01", "2013-08-28")
+c3_input_df <- add_phase(c3_input_df, "c3", "2013-08-01", "2015-06-12", "2015-07-02")
+
+# Combine the dataframes into one
+combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
+# Task-description rows only; date_closed is assumed to be epoch seconds like date_created, so the gap is in seconds.
+combined_task_df <- combined_df |>
+  filter(comment_type == "task_description") |>
+  mutate(time_to_close = date_closed - date_created,
+         time_to_close_hours = time_to_close / 3600 # same value as difftime(..., units = "hours"), which only accepts raw numbers from R 4.3 on
+  )
+
+ggplot(combined_task_df, aes(x = priority_score, y = phase, color = source)) + # phase against priority_score, coloured by source
+  geom_point(alpha = 0.6) +                # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with its confidence band (se = TRUE)
+  theme_minimal()        
\ No newline at end of file