Cross-sectional EDA for phase 2 of the project; still needs to be made longitudinal

2025-06-23 14:37:15 -07:00 · 2025-06-23 14:37:15 -07:00 · ab1fe8e051
commit ab1fe8e051
parent fd1479775d
3 changed files with 111 additions and 3 deletions
--- a/062325_EDA.R
+++ b/062325_EDA.R
@ -0,0 +1,103 @@
+library(tidyverse)
+
+# Locations of the per-case Phabricator comment CSVs (cases 1-3).
+c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+# Read each CSV into its own data frame; the first row supplies the column names.
+c1_input_df <- read.csv(c1_count, header = TRUE)
+c2_input_df <- read.csv(c2_count, header = TRUE)
+c3_input_df <- read.csv(c3_count, header = TRUE)
+
+library(dplyr)
+
+# Tag every row with the case it came from ("c1", "c2" or "c3").
+c1_input_df <- mutate(c1_input_df, source = "c1")
+c2_input_df <- mutate(c2_input_df, source = "c2")
+c3_input_df <- mutate(c3_input_df, source = "c3")
+
+# Stack the three labelled frames row-wise into a single data frame.
+combined_df <- bind_rows(list(c1_input_df, c2_input_df, c3_input_df))
+
+# Keep task-description rows only; add time-to-close as a raw difference and in hours.
+# NOTE(review): assumes date_closed/date_created load in a type that supports `-` and difftime() -- confirm.
+combined_task_df <- combined_df |>
+  filter(comment_type == "task_description") |>
+  mutate(time_to_close = date_closed - date_created) |>
+  mutate(time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours")))
+
+# Time-to-close (hours) densities by source, faceted and filled by AuthorWMFAffil.
+ggplot(
+  combined_task_df,
+  aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)
+) +
+  ggdist::stat_halfeye(
+    adjust = 0.5, justification = 0,
+    width = 1.5, scale = 8.8,       # widened slab with an enlarged density scale
+    .width = 0, point_colour = NA   # presumably hides the interval and point -- confirm in ggdist docs
+  ) +
+  labs(
+    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
+    x = "Source", y = "Time to Close (hours)"
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  theme_minimal()
+
+# Share of each status within every (AuthorWMFAffil, source) pair.
+prop_df <- combined_task_df |>
+  count(AuthorWMFAffil, source, status) |>
+  group_by(AuthorWMFAffil, source) |>
+  mutate(prop = n / sum(n)) |>
+  ungroup()  # drop the grouping so later verbs on prop_df do not silently run per group
+
+# Stacked status shares per source, one facet per AuthorWMFAffil, y axis shown as percent.
+ggplot(prop_df, aes(x = source, y = prop, fill = status)) +
+  geom_col(position = "fill") +
+  scale_y_continuous(labels = scales::percent) +
+  facet_wrap(~ AuthorWMFAffil) +
+  theme_minimal() +
+  labs(
+    title = "Proportion of Phabricator Task Status by Affiliation and Case",
+    x = "Source",
+    y = "Proportion",
+    fill = "Status"
+  )
+
+library(stringr)
+# Modal verbs: the full list, and the subset that is counted separately.
+modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
+modal_subset <- c("should", "ought", "must")
+# One whole-word alternation per list, e.g. "\\bshould\\b|\\bought\\b|\\bmust\\b".
+modal_verbs_pattern <- paste0("\\b", modal_verbs, "\\b", collapse = "|")
+modal_subset_pattern <- paste0("\\b", modal_subset, "\\b", collapse = "|")
+# Count occurrences per comment_text with str_count(); sum(str_detect(...)) on a
+# single collapsed pattern is only ever 0 or 1 (presence, not a count).
+combined_task_df <- combined_task_df |>
+  mutate(
+    modal_verb_count = str_count(
+      str_to_lower(comment_text), modal_verbs_pattern
+    ),
+    modal_subset_count = str_count(
+      str_to_lower(comment_text), modal_subset_pattern
+    )
+  )
+
+# Violin plot of modal_subset_count per source with the mean overlaid as a point;
+# one facet per AuthorWMFAffil.
+ggplot(
+  combined_task_df,
+  aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)
+) +
+  geom_violin(alpha = 0.6, trim = FALSE, position = position_dodge(width = 0.8)) +
+  stat_summary(
+    geom = "point", fun = mean,
+    shape = 23, size = 3, color = "black", fill = "yellow",
+    position = position_dodge(width = 0.8)
+  ) +
+  labs(
+    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
+    x = "Source",
+    y = "Modal Verb Count"
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  theme_minimal()
--- a/mgaughan-rstudio-server_27074957.out
+++ b/mgaughan-rstudio-server_27074957.out
@ -1,17 +1,18 @@
 1. SSH tunnel from your workstation using the following command:

-   ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu
+   ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu

   and point your web browser to http://localhost:8787

 2. log in to RStudio Server using the following credentials:

   user: mjilg
-   password: YXXLCjS/064zAiagiRdx
+   password: xR04Y8VD4WRBYcJKI7NH

 When done using RStudio Server, terminate the job by:

 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:

-      scancel -f 26402644
+      scancel -f 27074957
+slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
--- a/phab_data_exploration.R
+++ b/phab_data_exploration.R
@ -0,0 +1,4 @@
+library(tidyverse)
+# Load the case 1 Phabricator comments export from the path defined in this script.
+c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"
+c1_phab_df <- read.csv(c1_phab, header = TRUE)