backup, trying to parse text
This commit is contained in:
		
							parent
							
								
									7f8b885ef4
								
							
						
					
					
						commit
						9c6034ca30
					
				| @ -13,3 +13,7 @@ ls | |||||||
| rm event_0215_ve_weekly_commit_count_data.csv | rm event_0215_ve_weekly_commit_count_data.csv | ||||||
| rm announcement_0215_ve_weekly_commit_count_data.csv | rm announcement_0215_ve_weekly_commit_count_data.csv | ||||||
| ls | ls | ||||||
|  | cd ~ | ||||||
|  | ls | ||||||
|  | ls .local | ||||||
|  | rm -r -f .local | ||||||
|  | |||||||
							
								
								
									
										4
									
								
								.wget-hsts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								.wget-hsts
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,4 @@ | |||||||
|  | # HSTS 1.0 Known Hosts database for GNU Wget. | ||||||
|  | # Edit at your own risk. | ||||||
|  | # <hostname>	<port>	<incl. subdomains>	<created>	<max-age> | ||||||
|  | www.python.org	0	1	1740094792	63072000 | ||||||
| @ -60,7 +60,7 @@ count_var_to_mean <- count_var / count_mean #2262140.471 | |||||||
| 
 | 
 | ||||||
| # the mean count values for each day | # the mean count values for each day | ||||||
| wiki_summary_df <- combined_df |> | wiki_summary_df <- combined_df |> | ||||||
|   filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |> |   filter(date >= as.Date("2012-12-01") & date <= as.Date("2013-09-30")) |> | ||||||
|   filter(wiki_db == "enwiki") |> |   filter(wiki_db == "enwiki") |> | ||||||
|   group_by(date) |> |   group_by(date) |> | ||||||
|   summarize( |   summarize( | ||||||
| @ -70,8 +70,10 @@ wiki_summary_df <- combined_df |> | |||||||
| #plotting it | #plotting it | ||||||
| p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) + | p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) + | ||||||
|   geom_line(color = "blue") +     # Line plot    # Points on the line |   geom_line(color = "blue") +     # Line plot    # Points on the line | ||||||
|   geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + |   geom_vline(xintercept = as.Date("2013-07-02"), linetype = "dashed", color = "black") + | ||||||
|   labs(title = "enwiki Total Bot Actions", |   geom_vline(xintercept = as.Date("2013-04-28"), linetype = "dashed", color = "black") + | ||||||
|  |   geom_vline(xintercept = as.Date("2012-12-12"), linetype = "dashed", color = "black") + | ||||||
|  |   labs(title = "enwiki Bot Actions 2012-12 to 2013-10", | ||||||
|        x = "Date (daily)", |        x = "Date (daily)", | ||||||
|        y = "Action Count") + |        y = "Action Count") + | ||||||
|   theme_minimal()     |   theme_minimal()     | ||||||
|  | |||||||
| @ -3,7 +3,7 @@ library(dplyr) | |||||||
| library(lubridate) | library(lubridate) | ||||||
| library(tidyr) | library(tidyr) | ||||||
| 
 | 
 | ||||||
| ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/mediawiki_core_commits.csv" | ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/extensions_visualeditor_commits.csv" | ||||||
| 
 | 
 | ||||||
| transform_commit_data <- function(filepath){ | transform_commit_data <- function(filepath){ | ||||||
|   #basic, loading in the file  |   #basic, loading in the file  | ||||||
| @ -14,8 +14,9 @@ transform_commit_data <- function(filepath){ | |||||||
| 
 | 
 | ||||||
|    |    | ||||||
|   # TODO: this is project/event specific |   # TODO: this is project/event specific | ||||||
|   #event_date <- as.Date("2013-07-01") |   event_date <- as.Date("2013-07-01") | ||||||
|   event_date <- as.Date("2013-06-06") |   #event_date <- as.Date("2013-04-28") | ||||||
|  |   #event_date <- as.Date("2012-12-12") | ||||||
|    |    | ||||||
|   # isolate project id |   # isolate project id | ||||||
|   project_id <- sub("_.*$", "", file_name) |   project_id <- sub("_.*$", "", file_name) | ||||||
| @ -68,7 +69,8 @@ transform_commit_data <- function(filepath){ | |||||||
|    |    | ||||||
|   #now cut out the commit data that we don't care about  |   #now cut out the commit data that we don't care about  | ||||||
|   df <- df |> |   df <- df |> | ||||||
|     filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |     filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |> | ||||||
|  |     filter(author_email != "jenkins-bot@gerrit.wikimedia.org") | ||||||
|    |    | ||||||
|   #in order: |   #in order: | ||||||
|   # - we group by project, week, ages |   # - we group by project, week, ages | ||||||
| @ -85,7 +87,6 @@ transform_commit_data <- function(filepath){ | |||||||
|               wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)), |               wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)), | ||||||
|               wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), |               wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), | ||||||
|               bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), |               bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), | ||||||
|               jenkins_commit_count = sum(grepl("jenkins-bot@gerrit.wikimedia.org|gerrit@wikimedia.org", author_email)), |  | ||||||
|               .groups = 'drop') |> |               .groups = 'drop') |> | ||||||
|     right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> |     right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> | ||||||
|     replace_na(list(commit_count = 0)) |> |     replace_na(list(commit_count = 0)) |> | ||||||
| @ -131,7 +132,7 @@ transform_commit_data <- function(filepath){ | |||||||
| 
 | 
 | ||||||
| test <- read.csv(ve_commit_fp, header = TRUE)  | test <- read.csv(ve_commit_fp, header = TRUE)  | ||||||
| transformed <- transform_commit_data(ve_commit_fp) | transformed <- transform_commit_data(ve_commit_fp) | ||||||
| output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/announcement_0215_core_weekly_commit_count_data.csv" | output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
| 
 | 
 | ||||||
| write.csv(transformed, output_filepath, row.names = FALSE) | write.csv(transformed, output_filepath, row.names = FALSE) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,19 +1,24 @@ | |||||||
| library(tidyverse) | library(tidyverse) | ||||||
| count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_ve_weekly_commit_count_data.csv" | count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
| input_df <- read.csv(count_data_fp, header = TRUE)  | input_df <- read.csv(count_data_fp, header = TRUE)  | ||||||
| 
 | 
 | ||||||
| input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count | input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count | ||||||
| 
 | 
 | ||||||
| window_num <- 19 | window_num <- 52 | ||||||
| input_df <- input_df |> | input_df <- input_df |> | ||||||
|   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> | ||||||
|  |   mutate(nonbot_commit_count = commit_count - bot_commit_count)|> | ||||||
|  |   mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> | ||||||
|  |   mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> | ||||||
|  |   select(-mediawiki_dev_commit_count) |> | ||||||
|  |   select(-wikia_commit_count) | ||||||
| 
 | 
 | ||||||
| library(scales) | library(scales) | ||||||
| library(ggplot2) | library(ggplot2) | ||||||
| 
 | 
 | ||||||
| time_plot <- input_df |> | time_plot <- input_df |> | ||||||
|   ggplot(aes(x=relative_week, y=wikimedia_commit_count)) + |   ggplot(aes(x=relative_week, y=nonbot_commit_count)) + | ||||||
|   labs(x="Weekly Offset", y="Wikimedia Commit Count") + |   labs(x="Weekly Offset", y="Nonbot Commit Count") + | ||||||
|   geom_smooth() + |   geom_smooth() + | ||||||
|   geom_vline(xintercept = 0)+ |   geom_vline(xintercept = 0)+ | ||||||
|   theme_bw() + |   theme_bw() + | ||||||
| @ -24,22 +29,24 @@ library(dplyr) | |||||||
| 
 | 
 | ||||||
| share_df <- input_df |> | share_df <- input_df |> | ||||||
|   mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> |   mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> | ||||||
|   mutate(wikia_share = wikia_commit_count / nonbot_commit_count) |> |   mutate(other_share = other_commit_count / nonbot_commit_count)|> | ||||||
|   mutate(gerrit_share = jenkins_commit_count / nonbot_commit_count) |>  |  | ||||||
|   mutate(mw_dev_share = mediawiki_dev_commit_count / nonbot_commit_count) |> |  | ||||||
|   mutate(other_share = (nonbot_commit_count - jenkins_commit_count - wikia_commit_count - wikimedia_commit_count - mediawiki_dev_commit_count) / nonbot_commit_count)|> |  | ||||||
|   drop_na() |   drop_na() | ||||||
| 
 | 
 | ||||||
| share_long <- share_df |> | share_long <- share_df |> | ||||||
|   dplyr::select(relative_week, wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share) |> |   dplyr::select(relative_week, wikimedia_share, other_share) |> | ||||||
|   pivot_longer(cols = c(wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share), names_to = "category", values_to = "share") |   pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share") | ||||||
| 
 | 
 | ||||||
| share_plot <- share_long |> | share_plot <- share_long |> | ||||||
|   ggplot(aes(x=relative_week, y=share, color=category)) + |   ggplot(aes(x=relative_week, y=share, color=category)) + | ||||||
|   geom_smooth() + |   geom_line() + | ||||||
|   geom_vline(xintercept = 0)+ |   geom_vline(xintercept = 0)+ | ||||||
|  |   annotate("text", x = -7, y=1, label = "2012-12-12") + | ||||||
|  |   geom_vline(xintercept = 19)+ | ||||||
|  |   annotate("text", x = 12, y=1, label = "2013-04-28") + | ||||||
|  |   geom_vline(xintercept = 28)+ | ||||||
|  |   annotate("text", x = 35, y=1, label = "2013-07-01") + | ||||||
|   labs(x = "Relative Week", y = "Share of Nonbot Commit Count", color = "Affiliation") + |   labs(x = "Relative Week", y = "Share of Nonbot Commit Count", color = "Affiliation") + | ||||||
|   ggtitle("Weekly Share of Nonbot Commit Count by Affiliation") + |   ggtitle("VE Weekly Share of Nonbot Commit Count by Affiliation (enwiki opt-in testing 2012-12-12)") + | ||||||
|   theme_bw() +  |   theme_bw() +  | ||||||
|   theme(legend.position = "top") |   theme(legend.position = "top") | ||||||
| share_plot | share_plot | ||||||
|  | |||||||
							
								
								
									
										132
									
								
								commit_analysis/matched_rdd_models.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										132
									
								
								commit_analysis/matched_rdd_models.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,132 @@ | |||||||
|  | library(tidyverse) | ||||||
|  | entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
|  | entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing") | ||||||
|  | 
 | ||||||
|  | widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
|  | widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing") | ||||||
|  | 
 | ||||||
|  | event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
|  | event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default") | ||||||
|  | 
 | ||||||
|  | input_df <- bind_rows(entest_df, widetest_df, event_df) | ||||||
|  | #input_df <- bind_rows(entest_df, widetest_df) | ||||||
|  | 
 | ||||||
|  | input_df <- input_df |> | ||||||
|  |   mutate(nonbot_commit_count = commit_count - bot_commit_count)|> | ||||||
|  |   mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> | ||||||
|  |   mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> | ||||||
|  |   dplyr::select(-mediawiki_dev_commit_count) |> | ||||||
|  |   dplyr::select(-wikia_commit_count) | ||||||
|  | 
 | ||||||
|  | library(MASS) | ||||||
|  | 
 | ||||||
|  | library(lme4) | ||||||
|  | library(dplyr) | ||||||
|  | 
 | ||||||
|  | #get into mlm format | ||||||
|  | long_df <- input_df |> | ||||||
|  |   pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), | ||||||
|  |                names_to = "commit_type", | ||||||
|  |                values_to = "lengthened_commit_count") | ||||||
|  | 
 | ||||||
|  | intermediate_long_df <- long_df |> | ||||||
|  |   mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> | ||||||
|  |   mutate(log_commits = log1p(lengthened_commit_count))|> | ||||||
|  |   mutate(scaled_long_commits = lengthened_commit_count / 10)  | ||||||
|  | 
 | ||||||
|  | library(rdd) | ||||||
|  | 
 | ||||||
|  | intermediate_long_df <- intermediate_long_df |> | ||||||
|  |   drop_na() | ||||||
|  | 
 | ||||||
|  | var(intermediate_long_df$commit_share) # 1253.343 | ||||||
|  | mean(intermediate_long_df$commit_share) # 44.92381 | ||||||
|  | median(intermediate_long_df$commit_share) # 39.5 | ||||||
|  | 
 | ||||||
|  | get_optimal_bandwidth <- function(df){ | ||||||
|  |   bw <- tryCatch({ | ||||||
|  |     IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular") | ||||||
|  |   }, error = function(e) { | ||||||
|  |     NA | ||||||
|  |   }) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | window_num <- 18 | ||||||
|  | final_long_df <- intermediate_long_df |> | ||||||
|  |   filter(relative_week >= (- window_num) & relative_week <= (window_num))  | ||||||
|  | 
 | ||||||
|  | library(fitdistrplus) | ||||||
|  | descdist(long_df$lengthened_commit_count, discrete=FALSE) | ||||||
|  | #start_values <- list(shape1 = 1, shape2 = 1) | ||||||
|  | #fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial") | ||||||
|  | print(fit) | ||||||
|  | 
 | ||||||
|  | #NOTE should not run if you've already dropped NA | ||||||
|  | mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + | ||||||
|  |                   (before_after*relative_week|commit_type) +  | ||||||
|  |                   (before_after*relative_week|rd_event), | ||||||
|  |                 control=glmerControl(optimizer="bobyqa", | ||||||
|  |                                      optCtrl=list(maxfun=2e5)), nAGQ=0, | ||||||
|  |                 data=long_df) | ||||||
|  | #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+ | ||||||
|  | #                    (before_after*relative_week|commit_type) +  | ||||||
|  | #                    (before_after*relative_week|rd_event) ,data=long_df) | ||||||
|  | summary(mlm) | ||||||
|  | qqnorm(residuals(mlm)) | ||||||
|  | res <- ranef(mlm) | ||||||
|  | print(res) | ||||||
|  | 
 | ||||||
|  | #final_long_df <- final_long_df |> | ||||||
|  | #  drop_na() | ||||||
|  | library(performance) | ||||||
|  | #descdist(long_df$commit_share, discrete=FALSE) | ||||||
|  | #fit <- MASS::fitdistr(as.numeric(long_df$commit_share), "normal") | ||||||
|  | #print(fit) | ||||||
|  | wikimedia_long_df <- final_long_df |> | ||||||
|  |   filter(commit_type == "wikimedia_commit_count") | ||||||
|  | wikimedia_share_lmer <- lmer(commit_share ~ before_after*relative_week + | ||||||
|  |                                (1| rd_event), | ||||||
|  |                   data=wikimedia_long_df) | ||||||
|  | summary(wikimedia_share_lmer) | ||||||
|  | icc(wikimedia_share_lmer) | ||||||
|  | 
 | ||||||
|  | other_long_df <- final_long_df |> | ||||||
|  |   filter(commit_type == "other_commit_count") | ||||||
|  | other_share_lmer <- lm(commit_share ~ before_after*relative_week, | ||||||
|  |                            data=other_long_df) | ||||||
|  | summary(other_share_lmer) | ||||||
|  | icc(other_share_lmer) | ||||||
|  | 
 | ||||||
|  | #power analysis  | ||||||
|  | #library(simr) | ||||||
|  | #simrOptions(progress=FALSE) | ||||||
|  | 
 | ||||||
|  | ## Intercept and slopes for intervention, time1, time2, intervention:time1, intervention:time2 | ||||||
|  | #wmf_fixed <- c(0.511, -0.166, 0.002, 0.007) | ||||||
|  | ## Random intercepts for participants clustered by class | ||||||
|  | #wmf_rand <- matrix(c( | ||||||
|  | #  0.01,  0.005, 0.002, 0.001, | ||||||
|  | #  0.005, 0.02,  0.003, 0.004, | ||||||
|  | #  0.002, 0.003, 0.015, 0.006, | ||||||
|  | #  0.001, 0.004, 0.006, 0.01 | ||||||
|  | #), nrow=4, byrow=TRUE) | ||||||
|  | ## residual variance | ||||||
|  | #wmf_res <- 0.2065 | ||||||
|  | 
 | ||||||
|  | #wmf_model <- makeLmer(commit_share ~ before_after*relative_week + (before_after*relative_week | rd_event), | ||||||
|  | #                      fixef=wmf_fixed, VarCorr=wmf_rand, sigma=wmf_res, data=wikimedia_long_df) | ||||||
|  | 
 | ||||||
|  | #sim_treat <- powerSim(wmf_model, nsim=100, test = fcompare(commit_share~relative_week)) | ||||||
|  | #sim_treat | ||||||
|  | 
 | ||||||
|  | #model_ext_subj <- extend(wmf_model, within="rd_event+before_after+relative_week", n=30) | ||||||
|  | #sim_treat_subj <- powerSim(model_ext_subj, nsim=100, test = fcompare(commit_share~before_after*relative_week)) | ||||||
|  | #sim_treat_subj | ||||||
|  | 
 | ||||||
|  | #p_curve_treat <- powerCurve(model_ext_subj, test=fcompare(commit_share~before_after*relative_week),  | ||||||
|  | #                            within="rd_event+before_after+relative_week",  | ||||||
|  | #                            breaks=c(5,10,15,20)) | ||||||
|  | #plot(p_curve_treat) | ||||||
| @ -1,6 +1,6 @@ | |||||||
| library(tidyverse) | library(tidyverse) | ||||||
| count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_core_weekly_commit_count_data.csv" | count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv" | ||||||
| input_df <- read.csv(count_data_fp, header = TRUE)  | input_df <- read.csv(count_data_fp, header = TRUE) | ||||||
| 
 | 
 | ||||||
| library(rdd) | library(rdd) | ||||||
| 
 | 
 | ||||||
| @ -18,34 +18,53 @@ get_optimal_bandwidth <- function(df){ | |||||||
| 
 | 
 | ||||||
| optimal_bandwidth <- get_optimal_bandwidth(input_df) | optimal_bandwidth <- get_optimal_bandwidth(input_df) | ||||||
| 
 | 
 | ||||||
| window_num <- 13 | window_num <- 8 | ||||||
| input_df <- input_df |> | input_df <- input_df |> | ||||||
|   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> |   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> | ||||||
|   mutate(other_commit_count = commit_count - bot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count - jenkins_commit_count) |   mutate(nonbot_commit_count = commit_count - bot_commit_count)|> | ||||||
|  |   mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |> | ||||||
|  |   mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |> | ||||||
|  |   select(-mediawiki_dev_commit_count) |> | ||||||
|  |   select(-wikia_commit_count) | ||||||
| 
 | 
 | ||||||
| library(MASS) | #library(MASS) | ||||||
| 
 | 
 | ||||||
| simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df) | #simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df) | ||||||
| summary(simple_model) | #summary(simple_model) | ||||||
| 
 | 
 | ||||||
| library(lme4) | library(lme4) | ||||||
| library(dplyr) | library(dplyr) | ||||||
| 
 | 
 | ||||||
| #get into mlm format | #get into mlm format | ||||||
| long_df <- input_df |> | long_df <- input_df |> | ||||||
|   pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, jenkins_commit_count, wikia_commit_count, mediawiki_dev_commit_count), |   pivot_longer(cols = c(other_commit_count, wikimedia_commit_count), | ||||||
|                names_to = "commit_type", |                names_to = "commit_type", | ||||||
|                values_to = "lengthened_commit_count") |                values_to = "lengthened_commit_count") | ||||||
| 
 | 
 | ||||||
| long_df <- long_df |> | long_df <- long_df |> | ||||||
|   mutate(commit_share = lengthened_commit_count / (commit_count - bot_commit_count)) |> |   mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |> | ||||||
|   mutate(log_commits = log1p(lengthened_commit_count)) |   mutate(log_commits = log1p(lengthened_commit_count)) | ||||||
| 
 | 
 | ||||||
| mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + (before_after*relative_week|commit_type), | mlm <- glmer.nb(log_commits ~ before_after*relative_week + (before_after*relative_week|commit_type), | ||||||
|                 control=glmerControl(optimizer="bobyqa", |                 control=glmerControl(optimizer="bobyqa", | ||||||
|                                      optCtrl=list(maxfun=2e5)), nAGQ=0, |                                      optCtrl=list(maxfun=2e5)), nAGQ=0, | ||||||
|                 data=long_df) |                 data=long_df) | ||||||
| summary(mlm) | summary(mlm) | ||||||
| ranefs <- ranef(mlm) | ranefs <- ranef(mlm) | ||||||
| print(ranefs) | print(ranefs) | ||||||
| saveRDS(mlm, "021525_core-ve_event_mlm.rda") | #saveRDS(mlm, "021525_core-ve_event_mlm.rda") | ||||||
|  | 
 | ||||||
|  | share_df <- input_df |> | ||||||
|  |   mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> | ||||||
|  |   mutate(other_share = other_commit_count / nonbot_commit_count)|> | ||||||
|  |   drop_na() | ||||||
|  | 
 | ||||||
|  | share_long <- share_df |> | ||||||
|  |   dplyr::select(relative_week, wikimedia_share, other_share, before_after) |> | ||||||
|  |   pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share") | ||||||
|  | 
 | ||||||
|  | share_mlm <- glmer.nb(share ~ before_after*relative_week + (before_after*relative_week|category), | ||||||
|  |                 control=glmerControl(optimizer="bobyqa", | ||||||
|  |                                      optCtrl=list(maxfun=2e5)), nAGQ=0, | ||||||
|  |                 data=share_long) | ||||||
|  | summary(share_mlm) | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								english-ewt-ud-2.5-191206.udpipe
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								english-ewt-ud-2.5-191206.udpipe
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -4,11 +4,10 @@ input_df <- read.csv(count_data_fp, header = TRUE) | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| #window_num <- 19 | #window_num <- 19 | ||||||
| window_num <- 52 | window_num <- 26 | ||||||
| input_df <- input_df |> | input_df <- input_df |> | ||||||
|   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> |   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> | ||||||
|   mutate(parent_projects = if_else(project == "mediawiki/extensions/VisualEditor",  |   mutate(parent_projects = project) | ||||||
|                                                        "mediawiki/extensions", project)) |  | ||||||
| 
 | 
 | ||||||
| library(scales) | library(scales) | ||||||
| library(ggplot2) | library(ggplot2) | ||||||
| @ -17,6 +16,7 @@ time_plot <- input_df |> | |||||||
|   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + |   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + | ||||||
|   labs(x="Weekly Offset", y="New Gerrit Tasks Created", color = "Project") + |   labs(x="Weekly Offset", y="New Gerrit Tasks Created", color = "Project") + | ||||||
|   geom_smooth() + |   geom_smooth() + | ||||||
|  |   geom_point() + | ||||||
|   geom_vline(xintercept = 0)+ |   geom_vline(xintercept = 0)+ | ||||||
|   theme_bw() + |   theme_bw() + | ||||||
|   theme(legend.position = "top") |   theme(legend.position = "top") | ||||||
| @ -28,7 +28,8 @@ abandoned_df <- input_df |> | |||||||
| time_plot <- abandoned_df |> | time_plot <- abandoned_df |> | ||||||
|   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + |   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) + | ||||||
|   labs(x="Weekly Offset", y="AbandonedGerrit Tasks Created", color = "Project") + |   labs(x="Weekly Offset", y="AbandonedGerrit Tasks Created", color = "Project") + | ||||||
|   geom_line() + |   geom_smooth() + | ||||||
|  |   geom_point() + | ||||||
|   geom_vline(xintercept = 0)+ |   geom_vline(xintercept = 0)+ | ||||||
|   theme_bw() + |   theme_bw() + | ||||||
|   theme(legend.position = "top") |   theme(legend.position = "top") | ||||||
| @ -37,7 +38,8 @@ time_plot | |||||||
| 
 | 
 | ||||||
| delta_df <- input_df |> | delta_df <- input_df |> | ||||||
|   filter(task_count != 0) |> |   filter(task_count != 0) |> | ||||||
|   filter(relative_week >= (- 12))  |   filter(relative_week >= (- 12)) |>  | ||||||
|  |   filter(status != "ABANDONED") | ||||||
| time_plot <- delta_df |> | time_plot <- delta_df |> | ||||||
|   ggplot(aes(x=relative_week, y=avg_resolution_time, color=parent_projects)) + |   ggplot(aes(x=relative_week, y=avg_resolution_time, color=parent_projects)) + | ||||||
|   labs(x="Weekly Offset", y="Avg. (weekly) Time from task creation to last update (days)", color = "Project") + |   labs(x="Weekly Offset", y="Avg. (weekly) Time from task creation to last update (days)", color = "Project") + | ||||||
| @ -49,11 +51,12 @@ time_plot | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| loc_df <- input_df |> | loc_df <- input_df |> | ||||||
|  |   filter(relative_week >= (- 19) & relative_week <= (19)) |> | ||||||
|   filter(task_count != 0) |> |   filter(task_count != 0) |> | ||||||
|   filter(status != "ABANDONED") |   filter(status != "ABANDONED") | ||||||
| time_plot <- loc_df |> | time_plot <- loc_df |> | ||||||
|   ggplot(aes(x=relative_week, y=avg_deletions, color=parent_projects)) + |   ggplot(aes(x=relative_week, y=avg_insertions, color=parent_projects)) + | ||||||
|   labs(x="Weekly Offset", y="Avg. LOC Deleted per Accepted Gerrit Task", color = "Project") + |   labs(x="Weekly Offset", y="Avg. LOC Inserted per Accepted Gerrit Task", color = "Project") + | ||||||
|   geom_line() + |   geom_line() + | ||||||
|   geom_vline(xintercept = 0)+ |   geom_vline(xintercept = 0)+ | ||||||
|   theme_bw() + |   theme_bw() + | ||||||
|  | |||||||
							
								
								
									
										18
									
								
								mgaughan-rstudio-server_24372601.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								mgaughan-rstudio-server_24372601.out
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | |||||||
|  | 1. SSH tunnel from your workstation using the following command: | ||||||
|  | 
 | ||||||
|  |    ssh -N -L 8787:n3441:34585 mjilg@klone.hyak.uw.edu | ||||||
|  | 
 | ||||||
|  |    and point your web browser to http://localhost:8787 | ||||||
|  | 
 | ||||||
|  | 2. log in to RStudio Server using the following credentials: | ||||||
|  | 
 | ||||||
|  |    user: mjilg | ||||||
|  |    password: WoborOUQ79MgRq898+pw | ||||||
|  | 
 | ||||||
|  | When done using RStudio Server, terminate the job by: | ||||||
|  | 
 | ||||||
|  | 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) | ||||||
|  | 2. Issue the following command on the login node: | ||||||
|  | 
 | ||||||
|  |       scancel -f 24372601 | ||||||
|  | slurmstepd: error: *** JOB 24372601 ON n3441 CANCELLED AT 2025-02-20T15:43:50 *** | ||||||
| @ -1,15 +1,48 @@ | |||||||
| library(dplyr) | library(dplyr) | ||||||
| library(ggplot2) | library(ggplot2) | ||||||
| phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/0205_convo_data/phab_data/visualeditor/0205_ve_phab_comments.csv" | phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv" | ||||||
| phab_data <- read.csv(phab_data_path, header=TRUE) | phab_data <- read.csv(phab_data_path, header=TRUE) | ||||||
| 
 | 
 | ||||||
| phab_data <- phab_data |> | phab_data <- phab_data |> | ||||||
|   mutate(has_ref = grepl("bots", comment_text)) |> |   mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |> | ||||||
|  |   mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |> | ||||||
|   mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |> |   mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |> | ||||||
|   filter(date_created < 1518232866 & date_created > 1356923678) |   mutate(comment_id = row_number())|> | ||||||
|  |   filter(date_created < 1383264000 & date_created > 1351728000) | ||||||
|  | #looking at all data between 11-1-2012 and 11-1-2013 | ||||||
|  | 
 | ||||||
|  | length(unique(phab_data$date_created)) | ||||||
|  | 
 | ||||||
|  | #g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) + | ||||||
|  | #  geom_point(alpha = 0.5) +  | ||||||
|  | #  theme_minimal() | ||||||
|  | #g | ||||||
|  | 
 | ||||||
|  | library(udpipe) | ||||||
|  | #library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file | ||||||
|  | 
 | ||||||
|  | library(tidytext) | ||||||
|  | library(dplyr) | ||||||
|  | library(stringr) | ||||||
|  | 
 | ||||||
|  | # we first need to transform our comment level of analysis into sentences | ||||||
|  | sentence_level_data <- phab_data |> | ||||||
|  |   unnest_tokens(sentence, comment_text, token = "sentences") |> | ||||||
|  |   group_by(comment_id) |> | ||||||
|  |   mutate(sentence_id = row_number())|> | ||||||
|  |   dplyr::select(-has_bot_ref, -has_ref)|> | ||||||
|  |   mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |> | ||||||
|  |   mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |> | ||||||
|  |   ungroup() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| g <- ggplot(phab_data, aes(x=timestamp, y=has_ref)) + | library(udpipe) | ||||||
|   geom_point(alpha = 0.5) +  | library(rsyntax) | ||||||
|   theme_minimal() | # Load necessary libraries | ||||||
| g | library(spacyr) | ||||||
|  | spacy_install() | ||||||
|  | #we only care about stuff that mentions VE rn, then tokenize | ||||||
|  | sentence_level_data <- sentence_level_data |> | ||||||
|  |   filter(has_ref == TRUE) |> | ||||||
|  |   mutate(sentence_tokens = udpipe(sentence, "english")) | ||||||
|  |    | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user