From a4d8685c130b9aeadc153f2b4936b1922ed62bb6 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 1 Jul 2025 07:45:13 -0700 Subject: [PATCH] updating with new sampling approach for c2 and c3 --- .sh_history | 5 +++++ p2_EDA/062325_EDA.R | 6 +++--- p2_EDA/clean_c2c3_phab.R | 10 ++++++---- p2_EDA/phab_weekly_bins.R | 7 ++++--- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.sh_history b/.sh_history index 4c939cd..2601cb3 100644 --- a/.sh_history +++ b/.sh_history @@ -176,3 +176,8 @@ ls case1 ls cd case2 ls +cd .. +ls +cd case3 +ls +rm 062725_c3_title_cleaned.csv diff --git a/p2_EDA/062325_EDA.R b/p2_EDA/062325_EDA.R index c85a58d..d24d37d 100644 --- a/p2_EDA/062325_EDA.R +++ b/p2_EDA/062325_EDA.R @@ -3,10 +3,10 @@ library(tidyverse) c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv" c1_input_df <- read.csv(c1_count , header = TRUE) -c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv" +c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv" c2_input_df <- read.csv(c2_count , header = TRUE) -c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv" +c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv" c3_input_df <- read.csv(c3_count , header = TRUE) library(dplyr) @@ -88,7 +88,7 @@ combined_task_df <- combined_task_df %>% ungroup() # 3. Plot (e.g., bar plot of mean modal verbs per group) -ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) + +ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) + geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) + stat_summary( fun = mean, diff --git a/p2_EDA/clean_c2c3_phab.R b/p2_EDA/clean_c2c3_phab.R index 43e2518..db99f53 100644 --- a/p2_EDA/clean_c2c3_phab.R +++ b/p2_EDA/clean_c2c3_phab.R @@ -65,10 +65,12 @@ write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/ca # C2: after 9-3-2011 before 11-27-2013 c2_desc_flags <- c2_input_df %>% filter(comment_type == "task_description") %>% - mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% + mutate(http_flag = sapply(task_title, http_relevant)) |> mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |> select(TaskPHID, http_flag, time_flag) +#mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% + c2_flagged <- c2_input_df %>% left_join(c2_desc_flags, by = "TaskPHID") @@ -77,11 +79,11 @@ c2_sampled <- c2_flagged |> filter(time_flag == TRUE) sum(c2_sampled$comment_type == "task_description") -write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned.csv", row.names=FALSE) +write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv", row.names=FALSE) # C3: after 07-01-2013 before 10-01-2015 c3_desc_flags <- c3_input_df %>% filter(comment_type == "task_description") %>% - mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% + mutate(http_flag = sapply(task_title, http_relevant)) |> mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |> select(TaskPHID, http_flag, time_flag) @@ -93,4 +95,4 @@ c3_sampled <- c3_flagged |> filter(time_flag == TRUE) sum(c3_sampled$comment_type == "task_description") -write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned.csv", row.names=FALSE) +write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv", row.names=FALSE) diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R index d059947..60632c6 100644 --- a/p2_EDA/phab_weekly_bins.R +++ b/p2_EDA/phab_weekly_bins.R @@ -3,12 +3,13 @@ library(tidyverse) c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv" c1_input_df <- read.csv(c1_count , header = TRUE) -c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv" +c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv" c2_input_df <- read.csv(c2_count , header = TRUE) -c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv" +c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv" c3_input_df <- read.csv(c3_count , header = TRUE) + #getting the relative weeks to the publication date relative_week <- function(date, ref_date) { as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7) @@ -138,7 +139,7 @@ combined_task_df <- combined_task_df %>% ) %>% ungroup() library(ggdist) -ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype=AuthorWMFAffil)) + +ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) + geom_point(alpha=0.1) + # Points, with some transparency geom_smooth(method = "loess", se = FALSE) + theme_minimal()