From ab1fe8e051e245128a0d08b13536179d1e38762c Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login01.hyak.local>
Date: Mon, 23 Jun 2025 14:37:15 -0700
Subject: [PATCH] cross-sectional EDA for phase 2 of the project, need to make it
 longitudinal

---
 062325_EDA.R                                  | 103 ++++++++++++++++++
 ...ut => mgaughan-rstudio-server_27074957.out |   7 +-
 phab_data_exploration.R                       |   4 +
 3 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 062325_EDA.R
 rename mgaughan-rstudio-server_26402644.out => mgaughan-rstudio-server_27074957.out (67%)
 create mode 100644 phab_data_exploration.R

diff --git a/062325_EDA.R b/062325_EDA.R
new file mode 100644
index 0000000..e8e8dde
--- /dev/null
+++ b/062325_EDA.R
@@ -0,0 +1,103 @@
+library(tidyverse)
+
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_input_df <- read.csv(c1_count , header = TRUE) 
+
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_input_df <- read.csv(c2_count , header = TRUE) 
+
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_input_df <- read.csv(c3_count , header = TRUE) 
+
+library(dplyr)
+
+# Label each dataframe with the case it came from
+c1_input_df <- c1_input_df %>% mutate(source = "c1")
+c2_input_df <- c2_input_df %>% mutate(source = "c2")
+c3_input_df <- c3_input_df %>% mutate(source = "c3")
+
+# Combine the dataframes into one
+combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
+# NOTE(review): read.csv() does not parse dates; confirm date_* column types
+combined_task_df <- combined_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(time_to_close = date_closed - date_created,
+         time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
+  )
+
+
+ggplot(combined_task_df, aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)) +
+  ggdist::stat_halfeye(
+    adjust = 0.5, 
+    width = 1.5,         # increase width
+    scale = 8.8,         # new: increase scale for fatter density
+    .width = 0, 
+    justification = 0, 
+    point_colour = NA
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  labs(
+    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
+    x = "Source",
+    y = "Time to Close (hours)"
+  ) +
+  theme_minimal()
+
+# Proportion of each status within every (AuthorWMFAffil, source) group
+prop_df <- combined_task_df %>%
+  count(AuthorWMFAffil, source, status) %>%
+  group_by(AuthorWMFAffil, source) %>%
+  mutate(prop = n / sum(n)) %>%
+  ungroup()  # drop grouping so later verbs do not silently run per group
+
+# Plot: filled bar plot (proportion)
+ggplot(prop_df, aes(x = source, y = prop, fill = status)) +
+  geom_col(position = "fill") +
+  facet_wrap(~ AuthorWMFAffil) +
+  scale_y_continuous(labels = scales::percent) +
+  labs(
+    title = "Proportion of Phabricator Task Status by Affiliation and Case",
+    x = "Source",
+    y = "Proportion",
+    fill = "Status"
+  ) +
+  theme_minimal()
+
+library(stringr)
+# modal verbs 
+modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
+modal_subset <- c('should', 'ought', 'must')
+# 1. Count modal-verb occurrences in each comment_text. str_count() tallies every
+#    match of the collapsed alternation; sum(str_detect()) on that single pattern
+#    could only yield 0 or 1. str_count() is vectorised, so no rowwise() needed.
+combined_task_df <- combined_task_df %>%
+  mutate(
+    modal_verb_count = str_count(
+      str_to_lower(comment_text),
+      paste0("\\b", modal_verbs, "\\b", collapse = "|")
+    ),
+    modal_subset_count = str_count(
+      str_to_lower(comment_text),
+      paste0("\\b", modal_subset, "\\b", collapse = "|")
+    )
+  )
+
+# 2. Plot: violin of modal_subset_count per group, with the group mean marked
+ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)) +
+  geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
+  stat_summary(
+    fun = mean,
+    geom = "point",
+    shape = 23,
+    size = 3,
+    color = "black",
+    fill = "yellow",
+    position = position_dodge(width = 0.8)
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  labs(
+    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
+    x = "Source",
+    y = "Modal Verb Count"
+  ) +
+  theme_minimal()
diff --git a/mgaughan-rstudio-server_26402644.out b/mgaughan-rstudio-server_27074957.out
similarity index 67%
rename from mgaughan-rstudio-server_26402644.out
rename to mgaughan-rstudio-server_27074957.out
index f2163d3..d528f93 100644
--- a/mgaughan-rstudio-server_26402644.out
+++ b/mgaughan-rstudio-server_27074957.out
@@ -1,17 +1,18 @@
 1. SSH tunnel from your workstation using the following command:
 
-   ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu
+   ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
 
    and point your web browser to http://localhost:8787
 
 2. log in to RStudio Server using the following credentials:
 
    user: mjilg
-   password: YXXLCjS/064zAiagiRdx
+   password: xR04Y8VD4WRBYcJKI7NH
 
 When done using RStudio Server, terminate the job by:
 
 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:
 
-      scancel -f 26402644
+      scancel -f 27074957
+slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
diff --git a/phab_data_exploration.R b/phab_data_exploration.R
new file mode 100644
index 0000000..392f481
--- /dev/null
+++ b/phab_data_exploration.R
@@ -0,0 +1,4 @@
+library(tidyverse)
+
+c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"
+c1_phab_df <- read.csv(c1_phab, header = TRUE)
\ No newline at end of file