updating with new sampling approach for c2 and c3
This commit is contained in:
parent
edcb174d42
commit
a4d8685c13
@ -176,3 +176,8 @@ ls case1
|
||||
ls
|
||||
cd case2
|
||||
ls
|
||||
cd ..
|
||||
ls
|
||||
cd case3
|
||||
ls
|
||||
rm 062725_c3_title_cleaned.csv
|
||||
|
@ -3,10 +3,10 @@ library(tidyverse)
|
||||
c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
|
||||
c1_input_df <- read.csv(c1_count , header = TRUE)
|
||||
|
||||
c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
|
||||
c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
|
||||
c2_input_df <- read.csv(c2_count , header = TRUE)
|
||||
|
||||
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
|
||||
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
|
||||
c3_input_df <- read.csv(c3_count , header = TRUE)
|
||||
|
||||
library(dplyr)
|
||||
@ -88,7 +88,7 @@ combined_task_df <- combined_task_df %>%
|
||||
ungroup()
|
||||
|
||||
# 3. Plot (violin plot of modal verb counts per source, split by WMF affiliation, with group means overlaid)
|
||||
ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) +
|
||||
ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) +
|
||||
geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
|
||||
stat_summary(
|
||||
fun = mean,
|
||||
|
@ -65,10 +65,12 @@ write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/ca
|
||||
# C2: on or after 09-03-2011 and on or before 11-27-2013 (both bounds inclusive)
|
||||
c2_desc_flags <- c2_input_df %>%
|
||||
filter(comment_type == "task_description") %>%
|
||||
mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
|
||||
mutate(http_flag = sapply(task_title, http_relevant)) |>
|
||||
mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |>
|
||||
select(TaskPHID, http_flag, time_flag)
|
||||
|
||||
#mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
|
||||
|
||||
c2_flagged <- c2_input_df %>%
|
||||
left_join(c2_desc_flags, by = "TaskPHID")
|
||||
|
||||
@ -77,11 +79,11 @@ c2_sampled <- c2_flagged |>
|
||||
filter(time_flag == TRUE)
|
||||
sum(c2_sampled$comment_type == "task_description")
|
||||
|
||||
write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned.csv", row.names=FALSE)
|
||||
write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv", row.names=FALSE)
|
||||
# C3: on or after 07-01-2013 and on or before 10-01-2015 (both bounds inclusive)
|
||||
c3_desc_flags <- c3_input_df %>%
|
||||
filter(comment_type == "task_description") %>%
|
||||
mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>%
|
||||
mutate(http_flag = sapply(task_title, http_relevant)) |>
|
||||
mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |>
|
||||
select(TaskPHID, http_flag, time_flag)
|
||||
|
||||
@ -93,4 +95,4 @@ c3_sampled <- c3_flagged |>
|
||||
filter(time_flag == TRUE)
|
||||
sum(c3_sampled$comment_type == "task_description")
|
||||
|
||||
write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned.csv", row.names=FALSE)
|
||||
write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv", row.names=FALSE)
|
||||
|
@ -3,12 +3,13 @@ library(tidyverse)
|
||||
c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
|
||||
c1_input_df <- read.csv(c1_count , header = TRUE)
|
||||
|
||||
c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv"
|
||||
c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
|
||||
c2_input_df <- read.csv(c2_count , header = TRUE)
|
||||
|
||||
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
|
||||
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"
|
||||
c3_input_df <- read.csv(c3_count , header = TRUE)
|
||||
|
||||
|
||||
# Compute the week offset of each date relative to the publication date
|
||||
relative_week <- function(date, ref_date) {
|
||||
as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
|
||||
@ -138,7 +139,7 @@ combined_task_df <- combined_task_df %>%
|
||||
) %>%
|
||||
ungroup()
|
||||
library(ggdist)
|
||||
ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype=AuthorWMFAffil)) +
|
||||
ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
|
||||
geom_point(alpha=0.1) + # Points, with some transparency
|
||||
geom_smooth(method = "loess", se = FALSE) +
|
||||
theme_minimal()
|
||||
|
Loading…
Reference in New Issue
Block a user