diff --git a/.sh_history b/.sh_history index 4a6ea5d..ec57dbd 100644 --- a/.sh_history +++ b/.sh_history @@ -8,3 +8,8 @@ cd .. ls ls commit_data ls commit_data/visualeditor +cd /mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/ +ls +rm event_0215_ve_weekly_commit_count_data.csv +rm announcement_0215_ve_weekly_commit_count_data.csv +ls diff --git a/commit_analysis/case1/021525_ve_event_mlm.rda b/commit_analysis/case1/021525_ve_event_mlm.rda new file mode 100644 index 0000000..28cd32b Binary files /dev/null and b/commit_analysis/case1/021525_ve_event_mlm.rda differ diff --git a/commit_analysis/case1/0215_commit_shares_GAM.png b/commit_analysis/case1/0215_commit_shares_GAM.png new file mode 100644 index 0000000..490e373 Binary files /dev/null and b/commit_analysis/case1/0215_commit_shares_GAM.png differ diff --git a/commit_analysis/case1/0215_jenkins_commits_GAM.png b/commit_analysis/case1/0215_jenkins_commits_GAM.png new file mode 100644 index 0000000..c0817f7 Binary files /dev/null and b/commit_analysis/case1/0215_jenkins_commits_GAM.png differ diff --git a/commit_analysis/case1/0215_nonbot_commits_GAM.png b/commit_analysis/case1/0215_nonbot_commits_GAM.png new file mode 100644 index 0000000..2ba31ce Binary files /dev/null and b/commit_analysis/case1/0215_nonbot_commits_GAM.png differ diff --git a/commit_analysis/case1/0215_wmf_commits_GAM.png b/commit_analysis/case1/0215_wmf_commits_GAM.png new file mode 100644 index 0000000..c6ebc1e Binary files /dev/null and b/commit_analysis/case1/0215_wmf_commits_GAM.png differ diff --git a/commit_analysis/commit_count_collation.R b/commit_analysis/commit_count_collation.R index c12d3b4..f1695a5 100644 --- a/commit_analysis/commit_count_collation.R +++ b/commit_analysis/commit_count_collation.R @@ -1,8 +1,9 @@ -library(tidyverse) +#library(tidyverse) library(dplyr) library(lubridate) +library(tidyr) -ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/VisualEditor_2012-01-01_to_2014-12-31.csv" +ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/visualeditor_commits.csv" transform_commit_data <- function(filepath){ #basic, loading in the file @@ -10,10 +11,11 @@ transform_commit_data <- function(filepath){ temp_df <- df dir_path = dirname(filepath) file_name = basename(filepath) - + # TODO: this is project/event specific - event_date <- as.Date("2013-07-01") + #event_date <- as.Date("2013-07-01") + event_date <- as.Date("2013-06-06") # isolate project id project_id <- sub("_.*$", "", file_name) @@ -48,9 +50,10 @@ transform_commit_data <- function(filepath){ #filler for when there are weeks without commits all_weeks <- seq(relative_week(start_date, event_date), relative_week(end_date, event_date)) - complete_weeks_df <- expand.grid(relative_week = all_weeks, + complete_weeks_df <- expand.grid(relative_week = all_weeks, project_id = project_id, age = project_age) + #for each week, get the list of unique authors that committed cumulative_authors <- df %>% @@ -79,9 +82,10 @@ transform_commit_data <- function(filepath){ author_emails = list(unique(author_email)), committer_emails = list(unique(committer_email)), mediawiki_dev_commit_count = sum(grepl("@users.mediawiki.org", author_email)), - wikimedia_commit_count = sum(grepl("@wikimedia.org", author_email)), - l10n_commit_count = sum(grepl("l10n-bot@translatewiki.net", author_email)), - jenkins_commit_count = sum(grepl("@gerrit.wikimedia.org", author_email)), + wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)), + wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), + bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), + jenkins_commit_count = sum(grepl("jenkins-bot@gerrit.wikimedia.org|gerrit@wikimedia.org", author_email)), .groups = 'drop') |> right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |> replace_na(list(commit_count = 0)) |> @@ -89,6 +93,7 @@ transform_commit_data <- function(filepath){ replace_na(list(l10n_commit_count = 0)) |> replace_na(list(jenkins_commit_count = 0)) |> replace_na(list(mediawiki_dev_commit_count = 0)) |> + replace_na(list(wikia_commit_count = 0)) |> mutate(before_after = if_else(relative_week < 0, 0, 1)) # then, to get the authorship details in # we check if the email data is present, if not we fill in blank @@ -126,8 +131,8 @@ transform_commit_data <- function(filepath){ test <- read.csv(ve_commit_fp, header = TRUE) transformed <- transform_commit_data(ve_commit_fp) -output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/0210_ve_weekly_count_data.csv" - +output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/announcement_0215_ve_weekly_commit_count_data.csv" +project_id <- "test" write.csv(transformed, output_filepath, row.names = FALSE) diff --git a/commit_analysis/commit_plotting.R b/commit_analysis/commit_plotting.R new file mode 100644 index 0000000..6561218 --- /dev/null +++ b/commit_analysis/commit_plotting.R @@ -0,0 +1,40 @@ +library(tidyverse) +count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_ve_weekly_commit_count_data.csv" +input_df <- read.csv(count_data_fp, header = TRUE) + +input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count + +library(scales) +library(ggplot2) + +time_plot <- input_df |> + ggplot(aes(x=relative_week, y=jenkins_commit_count)) + + labs(x="Weekly Offset", y="Gerrit/Jenkins Commit Count") + + geom_smooth() + + geom_vline(xintercept = 0)+ + theme_bw() + + theme(legend.position = "top") +time_plot + +share_df <- input_df |> + mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |> + mutate(wikia_share = wikia_commit_count / nonbot_commit_count) |> + mutate(gerrit_share = jenkins_commit_count / nonbot_commit_count) |> + mutate(mw_dev_share = mediawiki_dev_commit_count / nonbot_commit_count) |> + mutate(other_share = (nonbot_commit_count - jenkins_commit_count - wikia_commit_count - wikimedia_commit_count - mediawiki_dev_commit_count) / nonbot_commit_count)|> + drop_na() + +share_long <- share_df |> + select(relative_week, wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share) |> + pivot_longer(cols = c(wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share), names_to = "category", values_to = "share") + +share_plot <- share_long |> + ggplot(aes(x=relative_week, y=share, color=category)) + + geom_smooth() + + geom_vline(xintercept = 0)+ + labs(x = "Relative Week", y = "Share of Nonbot Commit Count", color = "Affiliation") + + ggtitle("Weekly Share of Nonbot Commit Count by Category") + + theme_bw() + + theme(legend.position = "top") +share_plot + diff --git a/commit_analysis/mlm.R b/commit_analysis/mlm.R deleted file mode 100644 index 20fe625..0000000 --- a/commit_analysis/mlm.R +++ /dev/null @@ -1,18 +0,0 @@ -count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/0210_ve_weekly_count_data.csv" -input_df <- read_csv(count_data_fp) - -input_df$nonbot_commit_count <- input_df$commit_count - input_df$l10n_commit_count - input_df$jenkins_commit_count - -input_df <- input_df |> - filter(relative_week < 79) -library(scales) -library(ggplot2) - -time_plot <- input_df |> - ggplot(aes(x=relative_week, y=wikimedia_commit_count)) + - labs(x="Weekly Offset", y="WMF Commit Count") + - geom_smooth() + - geom_vline(xintercept = 0)+ - theme_bw() + - theme(legend.position = "top") -time_plot diff --git a/commit_analysis/models.R b/commit_analysis/models.R new file mode 100644 index 0000000..e7afab3 --- /dev/null +++ b/commit_analysis/models.R @@ -0,0 +1,45 @@ +library(tidyverse) +count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_ve_weekly_commit_count_data.csv" +input_df <- read.csv(count_data_fp, header = TRUE) + +library(rdd) + +var(input_df$commit_count) # 1253.343 +mean(input_df$commit_count) # 44.92381 +median(input_df$commit_count) # 39.5 + +get_optimal_bandwidth <- function(df){ + bw <- tryCatch({ + IKbandwidth(df$relative_week, df$commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular") + }, error = function(e) { + NA + }) +} + +optimal_bandwidth <- get_optimal_bandwidth(input_df) + +window_num <- 19 +input_df <- input_df |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> + mutate(other_commit_count = commit_count - bot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count - jenkins_commit_count) + + +simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df) +summary(simple_model) + +library(lme4) +library(dplyr) +#get into mlm format +long_df <- input_df |> + pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, jenkins_commit_count, wikia_commit_count, mediawiki_dev_commit_count), + names_to = "commit_type", + values_to = "lengthened_commit_count") + +mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + (before_after*relative_week|commit_type), + control=glmerControl(optimizer="bobyqa", + optCtrl=list(maxfun=2e5)), nAGQ=0, + data=long_df) +summary(mlm) +ranefs <- ranef(mlm) +print(ranefs) +saveRDS(mlm, "021525_ve_event_mlm.rda")