updating with new sampling approach for c2 and c3
This commit is contained in:
		
							parent
							
								
									edcb174d42
								
							
						
					
					
						commit
						a4d8685c13
					
				| @ -176,3 +176,8 @@ ls case1 | ||||
| ls | ||||
| cd case2 | ||||
| ls | ||||
| cd .. | ||||
| ls | ||||
| cd case3 | ||||
| ls | ||||
| rm 062725_c3_title_cleaned.csv    | ||||
|  | ||||
| @ -3,10 +3,10 @@ library(tidyverse) | ||||
| c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv" | ||||
| c1_input_df <- read.csv(c1_count , header = TRUE)  | ||||
| 
 | ||||
| c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv" | ||||
| c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv" | ||||
| c2_input_df <- read.csv(c2_count , header = TRUE)  | ||||
| 
 | ||||
| c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv" | ||||
| c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv" | ||||
| c3_input_df <- read.csv(c3_count , header = TRUE)  | ||||
| 
 | ||||
| library(dplyr) | ||||
| @ -88,7 +88,7 @@ combined_task_df <- combined_task_df %>% | ||||
|   ungroup() | ||||
| 
 | ||||
| # 3. Plot (e.g., bar plot of mean modal verbs per group) | ||||
| ggplot(combined_task_df, aes(x = source, y = whatever_subset_count, fill = AuthorWMFAffil)) + | ||||
| ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) + | ||||
|   geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) + | ||||
|   stat_summary( | ||||
|     fun = mean, | ||||
|  | ||||
| @ -65,10 +65,12 @@ write.csv(c1_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/ca | ||||
| # C2: after 9-3-2011 before 11-27-2013  | ||||
| c2_desc_flags <- c2_input_df %>% | ||||
|   filter(comment_type == "task_description") %>% | ||||
|   mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% | ||||
|   mutate(http_flag = sapply(task_title, http_relevant)) |> | ||||
|   mutate(time_flag = date_created >= 1315008000 & date_created <= 1385596799) |> | ||||
|   select(TaskPHID, http_flag, time_flag) | ||||
| 
 | ||||
| #mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% | ||||
| 
 | ||||
| c2_flagged <- c2_input_df %>% | ||||
|   left_join(c2_desc_flags, by = "TaskPHID") | ||||
| 
 | ||||
| @ -77,11 +79,11 @@ c2_sampled <- c2_flagged |> | ||||
|   filter(time_flag == TRUE) | ||||
| sum(c2_sampled$comment_type == "task_description") | ||||
| 
 | ||||
| write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned.csv", row.names=FALSE) | ||||
| write.csv(c2_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv", row.names=FALSE) | ||||
| # C3: after 07-01-2013 before 10-01-2015 | ||||
| c3_desc_flags <- c3_input_df %>% | ||||
|   filter(comment_type == "task_description") %>% | ||||
|   mutate(http_flag = mapply(function(txt, ttl) http_relevant(txt) || http_relevant(ttl), comment_text, task_title)) %>% | ||||
|   mutate(http_flag = sapply(task_title, http_relevant)) |> | ||||
|   mutate(time_flag = date_created >= 1372636800 & date_created <= 1443743999) |> | ||||
|   select(TaskPHID, http_flag, time_flag) | ||||
| 
 | ||||
| @ -93,4 +95,4 @@ c3_sampled <- c3_flagged |> | ||||
|   filter(time_flag == TRUE) | ||||
| sum(c3_sampled$comment_type == "task_description") | ||||
| 
 | ||||
| write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned.csv", row.names=FALSE) | ||||
| write.csv(c3_sampled, "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv", row.names=FALSE) | ||||
|  | ||||
| @ -3,12 +3,13 @@ library(tidyverse) | ||||
| c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv" | ||||
| c1_input_df <- read.csv(c1_count , header = TRUE)  | ||||
| 
 | ||||
| c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/062725_c2_cleaned_phab.csv" | ||||
| c2_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv" | ||||
| c2_input_df <- read.csv(c2_count , header = TRUE)  | ||||
| 
 | ||||
| c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv" | ||||
| c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv" | ||||
| c3_input_df <- read.csv(c3_count , header = TRUE)  | ||||
| 
 | ||||
| 
 | ||||
| #getting the relative weeks to the publication date | ||||
| relative_week <- function(date, ref_date) { | ||||
|   as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7) | ||||
| @ -138,7 +139,7 @@ combined_task_df <- combined_task_df %>% | ||||
|   ) %>% | ||||
|   ungroup() | ||||
| library(ggdist) | ||||
| ggplot(combined_df, aes(x = week_index, y = modal_verb_count, color = source, linetype=AuthorWMFAffil)) + | ||||
| ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) + | ||||
|   geom_point(alpha=0.1) +             # Points, with some transparency | ||||
|   geom_smooth(method = "loess", se = FALSE) +  | ||||
|   theme_minimal()       | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user