From 2af7983fdbd7952a94076777a268de6cc999dc39 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Fri, 27 Jun 2025 14:01:23 -0700
Subject: [PATCH] cleaned data and updated with some preliminary panel groupings, more longitudinal EDA needed

---
 mgaughan-rstudio-server_27074957.out | 18 ------
 062325_EDA.R => p2_EDA/062325_EDA.R  | 25 ++++++--
 p2_EDA/clean_c2c3_phab.R             | 96 ++++++++++++++++++++++++++++
 p2_EDA/phab_weekly_bins.R            | 74 +++++++++++++++++++++
 4 files changed, 189 insertions(+), 24 deletions(-)
 delete mode 100644 mgaughan-rstudio-server_27074957.out
 rename 062325_EDA.R => p2_EDA/062325_EDA.R (78%)
 create mode 100644 p2_EDA/clean_c2c3_phab.R
 create mode 100644 p2_EDA/phab_weekly_bins.R

diff --git a/mgaughan-rstudio-server_27074957.out b/mgaughan-rstudio-server_27074957.out
deleted file mode 100644
index d528f93..0000000
--- a/mgaughan-rstudio-server_27074957.out
+++ /dev/null
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: xR04Y8VD4WRBYcJKI7NH
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-   scancel -f 27074957
-slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
diff --git a/062325_EDA.R b/p2_EDA/062325_EDA.R
similarity index 78%
rename from 062325_EDA.R
rename to p2_EDA/062325_EDA.R
index e8e8dde..c85a58d 100644
--- a/062325_EDA.R
+++ b/p2_EDA/062325_EDA.R
@@ -1,12 +1,12 @@
 library(tidyverse)
-c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE)
 
-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE)
 
-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE)
 
 library(dplyr)
 
@@ -67,6 +67,7 @@ library(stringr)
 # modal verbs
 modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
 modal_subset <- c('should', 'ought', 'must')
+whatever_subset <- c('user')
 # 1. Count modal verbs in each comment_text
 combined_task_df <- combined_task_df %>%
   rowwise() %>%
@@ -78,12 +79,16 @@ combined_task_df <- combined_task_df %>%
     modal_subset_count = sum(str_detect(
       str_to_lower(comment_text),
       paste0("\\b", modal_subset, "\\b", collapse = "|")
+    )),
+    whatever_subset_count = sum(str_detect(
+      str_to_lower(comment_text),
+      paste0("\\b", whatever_subset, "\\b", collapse = "|")
     ))
   ) %>%
   ungroup()
 
 # 3. Plot (e.g., bar plot of mean modal verbs per group)
-ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)) +
+ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) +
   geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
   stat_summary(
     fun = mean,
@@ -96,8 +101,16 @@ ggplot(combined_task_df, aes(x = source, y = modal_subset_count, fill = AuthorWM
   ) +
   facet_wrap(~ AuthorWMFAffil) +
   labs(
-    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
+    title = "Distribution and Mean of 'user' by Affiliation and Source",
     x = "Source",
-    y = "Modal Verb Count"
+    y = "Count"
   ) +
   theme_minimal()
+
+binned_task_df <- combined_task_df |>
+  mutate(description_length = nchar(comment_text))
+
+ggplot(binned_task_df, aes(x = time_to_close_hours, y = priority_score, color = source)) +
+  geom_point(alpha = 0.6) + # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
+  theme_minimal()
\ No newline at end of file
diff --git a/p2_EDA/clean_c2c3_phab.R b/p2_EDA/clean_c2c3_phab.R
new file mode 100644
index 0000000..43e2518
--- /dev/null
+++ b/p2_EDA/clean_c2c3_phab.R
@@ -0,0 +1,96 @@
+library(tidyverse)
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
+c2_input_df <- read.csv(c2_count , header = TRUE)
+
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"
+c3_input_df <- read.csv(c3_count , header = TRUE)
+
+#sampling relevance for the text of the Task description
+http_relevant <- function(text) {
+  if (is.na(text)) {
+    return(FALSE)
+  }
+  # expanded dictionary for relevancy: http, login, SSL, TLS, certificate
+  words <- unlist(strsplit(text, "\\s+"))
+  for (word in words) {
+    lw <- tolower(word)
+    if (!grepl("://", lw, fixed = TRUE)) {
+      # http
+      if (grepl("http", lw, fixed = TRUE)) {
+        return(TRUE)
+      }
+      # login
+      if (grepl("login", lw, fixed = TRUE)) {
+        return(TRUE)
+      }
+      # ssl
+      if (grepl("ssl", lw, fixed = TRUE)) {
+        return(TRUE)
+      }
+      # tls
+      if (grepl("tls", lw, fixed = TRUE)) {
+        return(TRUE)
+      }
+      # cert (but not "certain")
+      if (startsWith(lw, "cert") && !startsWith(lw, "certain")) {
+        return(TRUE)
+      }
+    }
+  }
+  return(FALSE)
+}
+
+library(dplyr)
+
+
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
+c1_input_df <- read.csv(c1_count , header = TRUE)
+#C1: after 11-11-2012 before 10-1-2013
+c1_desc_flags <- c1_input_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(time_flag = date_created >= 1352592000 & date_created <= 1380671999 ) |>
+  select(TaskPHID, time_flag)
+
+c1_flagged <- c1_input_df %>%
+  left_join(c1_desc_flags, by = "TaskPHID")
+
+c1_sampled <- c1_flagged |>
+  filter(time_flag == TRUE)
+sum(c1_sampled$comment_type == "task_description")
+
+
+write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv", row.names=FALSE)
+
+# going off of the dateCreated of the Task
+# C2: after 9-3-2011 before 11-27-2013
+c2_desc_flags <- c2_input_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |>
+  select(TaskPHID, http_flag, time_flag)
+
+c2_flagged <- c2_input_df %>%
+  left_join(c2_desc_flags, by = "TaskPHID")
+
+c2_sampled <- c2_flagged |>
+  filter(http_flag == TRUE) |>
+  filter(time_flag == TRUE)
+sum(c2_sampled$comment_type == "task_description")
+
+write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv", row.names=FALSE)
+# C3: after 07-01-2013 before 10-01-2015
+c3_desc_flags <- c3_input_df %>%
+  filter(comment_type == "task_description") %>%
+  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |>
+  select(TaskPHID, http_flag, time_flag)
+
+c3_flagged <- c3_input_df %>%
+  left_join(c3_desc_flags, by = "TaskPHID")
+
+c3_sampled <- c3_flagged |>
+  filter(http_flag == TRUE) |>
+  filter(time_flag == TRUE)
+sum(c3_sampled$comment_type == "task_description")
+
+write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv", row.names=FALSE)
diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R
new file mode 100644
index 0000000..f6e8292
--- /dev/null
+++ b/p2_EDA/phab_weekly_bins.R
@@ -0,0 +1,74 @@
+library(tidyverse)
+
+c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
+c1_input_df <- read.csv(c1_count , header = TRUE)
+
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_input_df <- read.csv(c2_count , header = TRUE)
+
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_input_df <- read.csv(c3_count , header = TRUE)
+
+#phase of feature deployments
+# pre opt-in (0)
+# opt-in beta (1)
+# post-announcement pre-deployment (2)
+# post-deployment opt-out (3)
+# c1 key dates
+# opt-in = as.Date("2012-12-11")
+# deployment announcement = as.Date("2013-06-06")
+# deployment_date <- as.Date("2013-07-01")
+
+c1_input_df <- c1_input_df |>
+  mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
+  mutate(source = "c1") |>
+  mutate(phase = case_when(
+    date_created < as.numeric(as.POSIXct("2012-12-11", tz = "UTC")) ~ 0, # pre opt-in
+    date_created >= as.numeric(as.POSIXct("2012-12-11", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) ~ 1, # opt-in beta
+    date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
+    date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
+  ))
+
+
+# c2 key dates
+# opt-in = as.Date("2011-10-03")
+# deployment announcement = as.Date("2013-08-01")
+# deployment_date <- as.Date("2013-08-28")
+
+c2_input_df <- c2_input_df |>
+  mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
+  mutate(source = "c2") |>
+  mutate(phase = case_when(
+    date_created < as.numeric(as.POSIXct("2011-10-03", tz = "UTC")) ~ 0, # pre opt-in
+    date_created >= as.numeric(as.POSIXct("2011-10-03", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) ~ 1, # opt-in beta
+    date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
+    date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
+  ))
+
+# c3 key dates
+# opt-in = as.Date("2013-08-01")
+# deployment announcement = as.Date("2015-06-12")
+# deployment_date <- as.Date("2015-07-02")
+c3_input_df <- c3_input_df %>%
+  mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
+  mutate(source = "c3") |>
+  mutate(phase = case_when(
+    date_created < as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) ~ 0, # pre opt-in
+    date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) ~ 1, # opt-in beta
+    date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
+    date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
+  ))
+
+# Combine the dataframes into one
+combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
+
+combined_task_df <- combined_df %>%
+  filter(comment_type == "task_description") |>
+  mutate(time_to_close = date_closed - date_created,
+         time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
+  )
+
+ggplot(combined_task_df, aes(x = priority_score, y = phase, color = source)) +
+  geom_point(alpha = 0.6) + # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
+  theme_minimal()
\ No newline at end of file