Update c2 and c3 sampling to flag HTTP relevance from task titles only

2025-07-01 07:45:13 -07:00 · 2025-07-01 07:45:13 -07:00 · a4d8685c13
commit a4d8685c13
parent edcb174d42
4 changed files with 18 additions and 10 deletions
--- a/.sh_history
+++ b/.sh_history
@ -176,3 +176,8 @@ ls case1
 ls
 cd case2
 ls
+cd ..
+ls
+cd case3
+ls
+rm 062725_c3_title_cleaned.csv   
--- a/p2_EDA/062325_EDA.R
+++ b/p2_EDA/062325_EDA.R
@ -3,10 +3,10 @@ library(tidyverse)
 c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE) 

-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE) 

-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 

 library(dplyr)
@ -88,7 +88,7 @@ combined_task_df <- combined_task_df %>%
  ungroup()

 # 3. Plot (violin plot of modal verb counts per group, with group means overlaid)
-ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) +
+ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) +
  geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
  stat_summary(
    fun = mean,
--- a/p2_EDA/clean_c2c3_phab.R
+++ b/p2_EDA/clean_c2c3_phab.R
@ -65,10 +65,12 @@ write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/ca
 # C2: after 9-3-2011 before 11-27-2013 
 c2_desc_flags <- c2_input_df %>%
  filter(comment_type == "task_description") %>%
-  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(http_flag = sapply(task_title, http_relevant)) |>
  mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |>
  select(TaskPHID, http_flag, time_flag)

+# Previous rule (comment_text OR task_title), kept for reference: mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+
 c2_flagged <- c2_input_df %>%
  left_join(c2_desc_flags, by = "TaskPHID")

@ -77,11 +79,11 @@ c2_sampled <- c2_flagged |>
  filter(time_flag == TRUE)
 sum(c2_sampled$comment_type == "task_description")

-write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned.csv", row.names=FALSE)
+write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv", row.names=FALSE)
 # C3: after 07-01-2013 before 10-01-2015
 c3_desc_flags <- c3_input_df %>%
  filter(comment_type == "task_description") %>%
-  mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
+  mutate(http_flag = sapply(task_title, http_relevant)) |>
  mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |>
  select(TaskPHID, http_flag, time_flag)

@ -93,4 +95,4 @@ c3_sampled <- c3_flagged |>
  filter(time_flag == TRUE)
 sum(c3_sampled$comment_type == "task_description")

-write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned.csv", row.names=FALSE)
+write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv", row.names=FALSE)
--- a/p2_EDA/phab_weekly_bins.R
+++ b/p2_EDA/phab_weekly_bins.R
@ -3,12 +3,13 @@ library(tidyverse)
 c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
 c1_input_df <- read.csv(c1_count , header = TRUE) 

-c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
+c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
 c2_input_df <- read.csv(c2_count , header = TRUE) 

-c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
+c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
 c3_input_df <- read.csv(c3_count , header = TRUE) 

+
 # Compute the week offset of each date relative to the publication date
 relative_week <- function(date, ref_date) {
  as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
@ -138,7 +139,7 @@ combined_task_df <- combined_task_df %>%
  ) %>%
  ungroup()
 library(ggdist)
-ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype=AuthorWMFAffil)) +
+ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
  geom_point(alpha=0.1) +             # Points, with some transparency
  geom_smooth(method = "loess", se = FALSE) + 
  theme_minimal()