From a4d8685c130b9aeadc153f2b4936b1922ed62bb6 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login01.hyak.local>
Date: Tue, 1 Jul 2025 07:45:13 -0700
Subject: [PATCH] updating with new sampling approach for c2 and c3

---
 .sh_history               |  5 +++++
 p2_EDA/062325_EDA.R       |  6 +++---
 p2_EDA/clean_c2c3_phab.R  | 10 ++++++----
 p2_EDA/phab_weekly_bins.R |  7 ++++---
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.sh_history b/.sh_history
index 4c939cd..2601cb3 100644
--- a/.sh_history
+++ b/.sh_history
@@ -176,3 +176,8 @@ ls case1
 ls
 cd case2
 ls
+cd ..
+ls
+cd case3
+ls
+rm 062725_c3_title_cleaned.csv   
diff --git a/p2_EDA/062325_EDA.R b/p2_EDA/062325_EDA.R
index c85a58d..d24d37d 100644
--- a/p2_EDA/062325_EDA.R
+++ b/p2_EDA/062325_EDA.R
@@ -3,10 +3,10 @@ library(tidyverse)
 c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE) 
 
-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE) 
 
-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 
 
 library(dplyr)
@@ -88,7 +88,7 @@ combined_task_df <- combined_task_df %>%
   ungroup()
 
 # 3. Plot (e.g., bar plot of mean modal verbs per group)
-ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) +
+ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) +
   geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
   stat_summary(
     fun = mean,
diff --git a/p2_EDA/clean_c2c3_phab.R b/p2_EDA/clean_c2c3_phab.R
index 43e2518..db99f53 100644
--- a/p2_EDA/clean_c2c3_phab.R
+++ b/p2_EDA/clean_c2c3_phab.R
@@ -65,10 +65,12 @@ write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/ca
 # C2: after 9-3-2011 before 11-27-2013 
 c2_desc_flags <- c2_input_df %>%
   filter(comment_type == "task_description") %>%
-  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(http_flag = vapply(task_title, http_relevant, logical(1), USE.NAMES = FALSE)) |> # title-only flag; vapply keeps a plain logical vector even with zero rows
   mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |>
   select(TaskPHID, http_flag, time_flag)
 
+# Previous rule (flag if description text OR title is relevant), kept for reference: mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+
 c2_flagged <- c2_input_df %>%
   left_join(c2_desc_flags, by = "TaskPHID")
 
@@ -77,11 +79,11 @@ c2_sampled <- c2_flagged |>
   filter(time_flag == TRUE)
 sum(c2_sampled$comment_type == "task_description")
 
-write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned.csv", row.names=FALSE)
+write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv", row.names=FALSE)
 # C3: after 07-01-2013 before 10-01-2015
 c3_desc_flags <- c3_input_df %>%
   filter(comment_type == "task_description") %>%
-  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(http_flag = vapply(task_title, http_relevant, logical(1), USE.NAMES = FALSE)) |> # title-only flag; vapply keeps a plain logical vector even with zero rows
   mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |>
   select(TaskPHID, http_flag, time_flag)
 
@@ -93,4 +95,4 @@ c3_sampled <- c3_flagged |>
   filter(time_flag == TRUE)
 sum(c3_sampled$comment_type == "task_description")
 
-write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned.csv", row.names=FALSE)
+write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv", row.names=FALSE)
diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R
index d059947..60632c6 100644
--- a/p2_EDA/phab_weekly_bins.R
+++ b/p2_EDA/phab_weekly_bins.R
@@ -3,12 +3,13 @@ library(tidyverse)
 c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE) 
 
-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE) 
 
-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 
 
+
 #getting the relative weeks to the publication date
 relative_week <- function(date, ref_date) {
   as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
@@ -138,7 +139,7 @@ combined_task_df <- combined_task_df %>%
   ) %>%
   ungroup()
 library(ggdist)
-ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype=AuthorWMFAffil)) +
+ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
   geom_point(alpha=0.1) +             # Points, with some transparency
   geom_smooth(method = "loess", se = FALSE) + 
   theme_minimal()