From ab1fe8e051e245128a0d08b13536179d1e38762c Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Mon, 23 Jun 2025 14:37:15 -0700
Subject: [PATCH] crossectional EDA for phase 2 of the project, need to make it longitudinal

---
Review notes:
  * Line breaks and indentation were rebuilt from a whitespace-collapsed copy
    of this patch. Indentation and blank context lines in the .out hunk are a
    best guess and should be checked against the real file.
  * The blob ids on the "index" lines are those of the original commit and no
    longer match the edited R files.
  * The .out hunk records RStudio Server login passwords; consider keeping
    these job logs out of version control.

 062325_EDA.R                                  | 103 ++++++++++++++++++
 ...ut => mgaughan-rstudio-server_27074957.out |   7 +-
 phab_data_exploration.R                       |   4 +
 3 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 062325_EDA.R
 rename mgaughan-rstudio-server_26402644.out => mgaughan-rstudio-server_27074957.out (67%)
 create mode 100644 phab_data_exploration.R

diff --git a/062325_EDA.R b/062325_EDA.R
new file mode 100644
index 0000000..e8e8dde
--- /dev/null
+++ b/062325_EDA.R
@@ -0,0 +1,103 @@
+# Cross-sectional EDA of Phabricator task descriptions for cases c1-c3.
+library(tidyverse)
+library(dplyr)
+library(stringr)
+
+c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_input_df <- read.csv(c1_count, header = TRUE)
+c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_input_df <- read.csv(c2_count, header = TRUE)
+c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_input_df <- read.csv(c3_count, header = TRUE)
+
+# Add a column to each dataframe to label them
+c1_input_df <- c1_input_df %>% mutate(source = "c1")
+c2_input_df <- c2_input_df %>% mutate(source = "c2")
+c3_input_df <- c3_input_df %>% mutate(source = "c3")
+
+# Combine the dataframes into one
+combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
+
+# Keep task descriptions only and derive how long each task took to close.
+# NOTE(review): assumes date_created/date_closed come out of read.csv as
+# numeric or date-time values; character dates would break the subtraction.
+combined_task_df <- combined_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(
+    time_to_close = date_closed - date_created,
+    time_to_close_hours =
+      as.numeric(difftime(date_closed, date_created, units = "hours"))
+  )
+
+ggplot(combined_task_df, aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)) +
+  ggdist::stat_halfeye(
+    adjust = 0.5,
+    width = 1.5, # increase width
+    scale = 8.8, # new: increase scale for fatter density
+    .width = 0,
+    justification = 0,
+    point_colour = NA
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  labs(
+    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
+    x = "Source",
+    y = "Time to Close (hours)"
+  ) +
+  theme_minimal()
+
+# Calculate proportions of status within each (AuthorWMFAffil, source) group
+prop_df <- combined_task_df %>%
+  group_by(AuthorWMFAffil, source, status) %>%
+  summarize(n = n(), .groups = "drop") %>%
+  group_by(AuthorWMFAffil, source) %>%
+  mutate(prop = n / sum(n)) %>%
+  ungroup()
+
+# Plot: filled bar plot (proportion)
+ggplot(prop_df, aes(x = source, y = prop, fill = status)) +
+  geom_col(position = "fill") +
+  facet_wrap(~ AuthorWMFAffil) +
+  scale_y_continuous(labels = scales::percent) +
+  labs(
+    title = "Proportion of Phabricator Task Status by Affiliation and Case",
+    x = "Source",
+    y = "Proportion",
+    fill = "Status"
+  ) +
+  theme_minimal()
+
+# Modal verbs: the full set, and the subset plotted below.
+modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
+modal_subset <- c("should", "ought", "must")
+# One alternation pattern per set; \b stops "can" matching inside "cancel".
+modal_verbs_pattern <- paste0("\\b", modal_verbs, "\\b", collapse = "|")
+modal_subset_pattern <- paste0("\\b", modal_subset, "\\b", collapse = "|")
+
+# 1. Count modal verbs in each comment_text. str_count() returns the number of
+#    matches per row; sum(str_detect()) on one collapsed pattern gave only 0/1.
+combined_task_df <- combined_task_df %>%
+  mutate(
+    modal_verb_count = str_count(str_to_lower(comment_text), modal_verbs_pattern),
+    modal_subset_count = str_count(str_to_lower(comment_text), modal_subset_pattern)
+  )
+
+# 2. Plot: violin of the should/ought/must count per group, mean as a diamond
+ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)) +
+  geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
+  stat_summary(
+    fun = mean,
+    geom = "point",
+    shape = 23,
+    size = 3,
+    color = "black",
+    fill = "yellow",
+    position = position_dodge(width = 0.8)
+  ) +
+  facet_wrap(~ AuthorWMFAffil) +
+  labs(
+    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
+    x = "Source",
+    y = "Modal Verb Count"
+  ) +
+  theme_minimal()
diff --git a/mgaughan-rstudio-server_26402644.out b/mgaughan-rstudio-server_27074957.out
similarity index 67%
rename from mgaughan-rstudio-server_26402644.out
rename to mgaughan-rstudio-server_27074957.out
index f2163d3..d528f93 100644
--- a/mgaughan-rstudio-server_26402644.out
+++ b/mgaughan-rstudio-server_27074957.out
@@ -1,17 +1,18 @@
 1. SSH tunnel from your workstation using the following command:
 
-   ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu
+   ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
 
    and point your web browser to http://localhost:8787
 
 2. log in to RStudio Server using the following credentials:
 
    user: mjilg
-   password: YXXLCjS/064zAiagiRdx
+   password: xR04Y8VD4WRBYcJKI7NH
 
 When done using RStudio Server, terminate the job by:
 
 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:
 
-      scancel -f 26402644
+      scancel -f 27074957
+slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
diff --git a/phab_data_exploration.R b/phab_data_exploration.R
new file mode 100644
index 0000000..392f481
--- /dev/null
+++ b/phab_data_exploration.R
@@ -0,0 +1,4 @@
+library(tidyverse)
+# Load the case1 Phabricator comments export.
+c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"
+c1_phab_df <- read.csv(c1_phab, header = TRUE)
\ No newline at end of file