backup, trying to parse text
commit 9c6034ca30 (parent 7f8b885ef4)
@@ -13,3 +13,7 @@ ls
 rm event_0215_ve_weekly_commit_count_data.csv
 rm announcement_0215_ve_weekly_commit_count_data.csv
 ls
+cd ~
+ls
+ls .local
+rm -r -f .local
4 .wget-hsts Normal file
@@ -0,0 +1,4 @@
+# HSTS 1.0 Known Hosts database for GNU Wget.
+# Edit at your own risk.
+# <hostname> <port> <incl. subdomains> <created> <max-age>
+www.python.org 0 1 1740094792 63072000
@@ -60,7 +60,7 @@ count_var_to_mean <- count_var / count_mean #2262140.471
 
 # the mean count values for each day
 wiki_summary_df <- combined_df |>
-  filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |>
+  filter(date >= as.Date("2012-12-01") & date <= as.Date("2013-09-30")) |>
   filter(wiki_db == "enwiki") |>
   group_by(date) |>
   summarize(
@@ -70,8 +70,10 @@ wiki_summary_df <- combined_df |>
 #plotting it
 p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) +
   geom_line(color = "blue") + # Line plot # Points on the line
-  geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") +
-  labs(title = "enwiki Total Bot Actions",
+  geom_vline(xintercept = as.Date("2013-07-02"), linetype = "dashed", color = "black") +
+  geom_vline(xintercept = as.Date("2013-04-28"), linetype = "dashed", color = "black") +
+  geom_vline(xintercept = as.Date("2012-12-12"), linetype = "dashed", color = "black") +
+  labs(title = "enwiki Bot Actions 2012-12 to 2013-10",
     x = "Date (daily)",
     y = "Action Count") +
   theme_minimal()
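Note: the three dashed event markers added above could come from a single vector, since geom_vline accepts a vector xintercept. A minimal sketch (not part of the commit; reuses the p1 object from this hunk):

event_dates <- as.Date(c("2012-12-12", "2013-04-28", "2013-07-02"))
p1 <- p1 + geom_vline(xintercept = event_dates, linetype = "dashed", color = "black")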
@@ -3,7 +3,7 @@ library(dplyr)
 library(lubridate)
 library(tidyr)
 
-ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/mediawiki_core_commits.csv"
+ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/extensions_visualeditor_commits.csv"
 
 transform_commit_data <- function(filepath){
   #basic, loading in the file
@@ -14,8 +14,9 @@ transform_commit_data <- function(filepath){
 
 
   # TODO: this is project/event specific
-  #event_date <- as.Date("2013-07-01")
-  event_date <- as.Date("2013-06-06")
+  event_date <- as.Date("2013-07-01")
+  #event_date <- as.Date("2013-04-28")
+  #event_date <- as.Date("2012-12-12")
 
   # isolate project id
   project_id <- sub("_.*$", "", file_name)
@@ -68,7 +69,8 @@ transform_commit_data <- function(filepath){
 
   #now cut out the commit data that we don't care about
   df <- df |>
-    filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date)
+    filter(as.Date(event_date) >= start_date & as.Date(event_date) <= end_date) |>
+    filter(author_email != "jenkins-bot@gerrit.wikimedia.org")
 
   #in order:
   # - we group by project, week, ages
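Note: if more automated authors need excluding later, the single equality filter above generalizes to a vector match. A sketch (not part of the commit; bot_emails is a hypothetical name, with addresses taken from the grepl patterns in the next hunk):

bot_emails <- c("jenkins-bot@gerrit.wikimedia.org",
                "l10n-bot@translatewiki.net",
                "tools.libraryupgrader@tools.wmflabs.org")
df <- df |> filter(!author_email %in% bot_emails)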
@@ -85,7 +87,6 @@ transform_commit_data <- function(filepath){
     wikimedia_commit_count = sum(grepl("@wikimedia.org|@wikimedia.de", author_email)),
     wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)),
     bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)),
-    jenkins_commit_count = sum(grepl("jenkins-bot@gerrit.wikimedia.org|gerrit@wikimedia.org", author_email)),
     .groups = 'drop') |>
   right_join(complete_weeks_df, by=c("relative_week", "project_id", "age")) |>
   replace_na(list(commit_count = 0)) |>
@@ -131,7 +132,7 @@ transform_commit_data <- function(filepath){
 
 test <- read.csv(ve_commit_fp, header = TRUE)
 transformed <- transform_commit_data(ve_commit_fp)
-output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/announcement_0215_core_weekly_commit_count_data.csv"
+output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 
 write.csv(transformed, output_filepath, row.names = FALSE)
 
@@ -1,19 +1,24 @@
 library(tidyverse)
-count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_ve_weekly_commit_count_data.csv"
+count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 input_df <- read.csv(count_data_fp, header = TRUE)
 
 input_df$nonbot_commit_count <- input_df$commit_count - input_df$bot_commit_count
 
-window_num <- 19
+window_num <- 52
 input_df <- input_df |>
-  filter(relative_week >= (- window_num) & relative_week <= (window_num))
+  filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
+  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
+  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
+  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
+  select(-mediawiki_dev_commit_count) |>
+  select(-wikia_commit_count)
 
 library(scales)
 library(ggplot2)
 
 time_plot <- input_df |>
-  ggplot(aes(x=relative_week, y=wikimedia_commit_count)) +
-  labs(x="Weekly Offset", y="Wikimedia Commit Count") +
+  ggplot(aes(x=relative_week, y=nonbot_commit_count)) +
+  labs(x="Weekly Offset", y="Nonbot Commit Count") +
   geom_smooth() +
   geom_vline(xintercept = 0)+
   theme_bw() +
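Note: the chained mutates above can collapse into one mutate, since dplyr lets later expressions reference columns created earlier in the same call. A behavior-preserving sketch (not part of the commit):

input_df <- input_df |>
  filter(relative_week >= -window_num & relative_week <= window_num) |>
  mutate(nonbot_commit_count = commit_count - bot_commit_count,
         other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count -
           wikia_commit_count - wikimedia_commit_count,
         wikimedia_commit_count = wikimedia_commit_count +
           mediawiki_dev_commit_count + wikia_commit_count) |>
  select(-mediawiki_dev_commit_count, -wikia_commit_count)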
@@ -24,22 +29,24 @@ library(dplyr)
 
 share_df <- input_df |>
   mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |>
-  mutate(wikia_share = wikia_commit_count / nonbot_commit_count) |>
-  mutate(gerrit_share = jenkins_commit_count / nonbot_commit_count) |>
-  mutate(mw_dev_share = mediawiki_dev_commit_count / nonbot_commit_count) |>
-  mutate(other_share = (nonbot_commit_count - jenkins_commit_count - wikia_commit_count - wikimedia_commit_count - mediawiki_dev_commit_count) / nonbot_commit_count)|>
+  mutate(other_share = other_commit_count / nonbot_commit_count)|>
   drop_na()
 
 share_long <- share_df |>
-  dplyr::select(relative_week, wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share) |>
-  pivot_longer(cols = c(wikimedia_share, wikia_share, gerrit_share, mw_dev_share, other_share), names_to = "category", values_to = "share")
+  dplyr::select(relative_week, wikimedia_share, other_share) |>
+  pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share")
 
 share_plot <- share_long |>
   ggplot(aes(x=relative_week, y=share, color=category)) +
-  geom_smooth() +
+  geom_line() +
   geom_vline(xintercept = 0)+
+  annotate("text", x = -7, y=1, label = "2012-12-12") +
+  geom_vline(xintercept = 19)+
+  annotate("text", x = 12, y=1, label = "2013-04-28") +
+  geom_vline(xintercept = 28)+
+  annotate("text", x = 35, y=1, label = "2013-07-01") +
   labs(x = "Relative Week", y = "Share of Nonbot Commit Count", color = "Affiliation") +
-  ggtitle("Weekly Share of Nonbot Commit Count by Affiliation") +
+  ggtitle("VE Weekly Share of Nonbot Commit Count by Affiliation (enwiki opt-in testing 2012-12-12)") +
   theme_bw() +
   theme(legend.position = "top")
 share_plot
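Note: the hardcoded offsets 19 and 28 in the geom_vline calls above are the later events measured in weeks from the 2012-12-12 cutoff. A sketch of deriving them rather than hardcoding (not part of the commit; weeks_after is a hypothetical helper):

anchor_date <- as.Date("2012-12-12")
weeks_after <- function(d) as.numeric(difftime(as.Date(d), anchor_date, units = "weeks"))
weeks_after("2013-04-28")  # ~19.6 weeks
weeks_after("2013-07-01")  # ~28.7 weeks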
132 commit_analysis/matched_rdd_models.R Normal file
@@ -0,0 +1,132 @@
+library(tidyverse)
+entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
+
+widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
+
+event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
+event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
+
+input_df <- bind_rows(entest_df, widetest_df, event_df)
+#input_df <- bind_rows(entest_df, widetest_df)
+
+input_df <- input_df |>
+  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
+  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
+  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
+  dplyr::select(-mediawiki_dev_commit_count) |>
+  dplyr::select(-wikia_commit_count)
+
+library(MASS)
+
+library(lme4)
+library(dplyr)
+
+#get into mlm format
+long_df <- input_df |>
+  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+               names_to = "commit_type",
+               values_to = "lengthened_commit_count")
+
+intermediate_long_df <- long_df |>
+  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
+  mutate(log_commits = log1p(lengthened_commit_count))|>
+  mutate(scaled_long_commits = lengthened_commit_count / 10)
+
+library(rdd)
+
+intermediate_long_df <- intermediate_long_df |>
+  drop_na()
+
+var(intermediate_long_df$commit_share) # 1253.343
+mean(intermediate_long_df$commit_share) # 44.92381
+median(intermediate_long_df$commit_share) # 39.5
+
+get_optimal_bandwidth <- function(df){
+  bw <- tryCatch({
+    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+  }, error = function(e) {
+    NA
+  })
+}
+
+optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
+
+
+window_num <- 18
+final_long_df <- intermediate_long_df |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num))
+
+library(fitdistrplus)
+descdist(long_df$lengthened_commit_count, discrete=FALSE)
+#start_values <- list(shape1 = 1, shape2 = 1)
+#fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
+print(fit)
+
+#NOTE should not run if you've already dropped NA
+mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
+                  (before_after*relative_week|commit_type) +
+                  (before_after*relative_week|rd_event),
+                control=glmerControl(optimizer="bobyqa",
+                                     optCtrl=list(maxfun=2e5)), nAGQ=0,
+                data=long_df)
+#mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
+#              (before_after*relative_week|commit_type) +
+#              (before_after*relative_week|rd_event) ,data=long_df)
+summary(mlm)
+qqnorm(residuals(mlm))
+res <- ranef(mlm)
+print(res)
+
+#final_long_df <- final_long_df |>
+#  drop_na()
+library(performance)
+#descdist(long_df$commit_share, discrete=FALSE)
+#fit <- MASS::fitdistr(as.numeric(long_df$commit_share), "normal")
+#print(fit)
+wikimedia_long_df <- final_long_df |>
+  filter(commit_type == "wikimedia_commit_count")
+wikimedia_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+                               (1| rd_event),
+                             data=wikimedia_long_df)
+summary(wikimedia_share_lmer)
+icc(wikimedia_share_lmer)
+
+other_long_df <- final_long_df |>
+  filter(commit_type == "other_commit_count")
+other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+                       data=other_long_df)
+summary(other_share_lmer)
+icc(other_share_lmer)
+
+#power analysis
+#library(simr)
+#simrOptions(progress=FALSE)
+
+## Intercept and slopes for intervention, time1, time2, intervention:time1, intervention:time2
+#wmf_fixed <- c(0.511, -0.166, 0.002, 0.007)
+## Random intercepts for participants clustered by class
+#wmf_rand <- matrix(c(
+#  0.01, 0.005, 0.002, 0.001,
+#  0.005, 0.02, 0.003, 0.004,
+#  0.002, 0.003, 0.015, 0.006,
+#  0.001, 0.004, 0.006, 0.01
+#), nrow=4, byrow=TRUE)
+## residual variance
+#wmf_res <- 0.2065
+
+#wmf_model <- makeLmer(commit_share ~ before_after*relative_week + (before_after*relative_week | rd_event),
+#                      fixef=wmf_fixed, VarCorr=wmf_rand, sigma=wmf_res, data=wikimedia_long_df)
+
+#sim_treat <- powerSim(wmf_model, nsim=100, test = fcompare(commit_share~relative_week))
+#sim_treat
+
+#model_ext_subj <- extend(wmf_model, within="rd_event+before_after+relative_week", n=30)
+#sim_treat_subj <- powerSim(model_ext_subj, nsim=100, test = fcompare(commit_share~before_after*relative_week))
+#sim_treat_subj
+
+#p_curve_treat <- powerCurve(model_ext_subj, test=fcompare(commit_share~before_after*relative_week),
+#                            within="rd_event+before_after+relative_week",
+#                            breaks=c(5,10,15,20))
+#plot(p_curve_treat)
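Note: get_optimal_bandwidth above wraps rdd::IKbandwidth, the Imbens-Kalyanaraman optimal bandwidth for a sharp regression discontinuity with the cutpoint at relative week 0. For context, a sketch of feeding that bandwidth into the package's standard local-linear estimator (illustrative, not part of the commit):

library(rdd)
rd_fit <- RDestimate(commit_share ~ relative_week, data = intermediate_long_df,
                     cutpoint = 0, bw = optimal_bandwidth, kernel = "triangular")
summary(rd_fit)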
@@ -1,6 +1,6 @@
 library(tidyverse)
-count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0215_core_weekly_commit_count_data.csv"
+count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
 input_df <- read.csv(count_data_fp, header = TRUE)
 
 library(rdd)
 
@@ -18,34 +18,53 @@ get_optimal_bandwidth <- function(df){
 
 optimal_bandwidth <- get_optimal_bandwidth(input_df)
 
-window_num <- 13
+window_num <- 8
 input_df <- input_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
-  mutate(other_commit_count = commit_count - bot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count - jenkins_commit_count)
+  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
+  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
+  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
+  select(-mediawiki_dev_commit_count) |>
+  select(-wikia_commit_count)
 
-library(MASS)
+#library(MASS)
 
-simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df)
-summary(simple_model)
+#simple_model <- glm.nb(commit_count~before_after*relative_week, data=input_df)
+#summary(simple_model)
 
 library(lme4)
 library(dplyr)
 
 #get into mlm format
 long_df <- input_df |>
-  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count, jenkins_commit_count, wikia_commit_count, mediawiki_dev_commit_count),
+  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
                names_to = "commit_type",
                values_to = "lengthened_commit_count")
 
 long_df <- long_df |>
-  mutate(commit_share = lengthened_commit_count / (commit_count - bot_commit_count)) |>
+  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
   mutate(log_commits = log1p(lengthened_commit_count))
 
-mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week + (before_after*relative_week|commit_type),
+mlm <- glmer.nb(log_commits ~ before_after*relative_week + (before_after*relative_week|commit_type),
                 control=glmerControl(optimizer="bobyqa",
                                      optCtrl=list(maxfun=2e5)), nAGQ=0,
                 data=long_df)
 summary(mlm)
 ranefs <- ranef(mlm)
 print(ranefs)
-saveRDS(mlm, "021525_core-ve_event_mlm.rda")
+#saveRDS(mlm, "021525_core-ve_event_mlm.rda")
 
+share_df <- input_df |>
+  mutate(wikimedia_share = wikimedia_commit_count / nonbot_commit_count) |>
+  mutate(other_share = other_commit_count / nonbot_commit_count)|>
+  drop_na()
+
+share_long <- share_df |>
+  dplyr::select(relative_week, wikimedia_share, other_share, before_after) |>
+  pivot_longer(cols = c(wikimedia_share, other_share), names_to = "category", values_to = "share")
+
+share_mlm <- glmer.nb(share ~ before_after*relative_week + (before_after*relative_week|category),
+                      control=glmerControl(optimizer="bobyqa",
+                                           optCtrl=list(maxfun=2e5)), nAGQ=0,
+                      data=share_long)
+summary(share_mlm)
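Note: for inspecting the interrupted-time-series fit above, lme4 and performance provide the usual accessors. A sketch (not part of the commit; Wald intervals are the fast approximation):

fixef(mlm)                      # fixed effects, incl. before_after:relative_week
confint(mlm, method = "Wald")   # approximate intervals for those estimates
performance::icc(mlm)           # variance attributable to the commit_type grouping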
BIN english-ewt-ud-2.5-191206.udpipe Normal file
Binary file not shown.
@@ -4,11 +4,10 @@ input_df <- read.csv(count_data_fp, header = TRUE)
 
 
 #window_num <- 19
-window_num <- 52
+window_num <- 26
 input_df <- input_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
-  mutate(parent_projects = if_else(project == "mediawiki/extensions/VisualEditor",
-                                   "mediawiki/extensions", project))
+  mutate(parent_projects = project)
 
 library(scales)
 library(ggplot2)
@@ -17,6 +16,7 @@ time_plot <- input_df |>
   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) +
   labs(x="Weekly Offset", y="New Gerrit Tasks Created", color = "Project") +
   geom_smooth() +
+  geom_point() +
   geom_vline(xintercept = 0)+
   theme_bw() +
   theme(legend.position = "top")
@@ -28,7 +28,8 @@ abandoned_df <- input_df |>
 time_plot <- abandoned_df |>
   ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) +
   labs(x="Weekly Offset", y="AbandonedGerrit Tasks Created", color = "Project") +
-  geom_line() +
+  geom_smooth() +
+  geom_point() +
   geom_vline(xintercept = 0)+
   theme_bw() +
   theme(legend.position = "top")
@@ -37,7 +38,8 @@ time_plot
 
 delta_df <- input_df |>
   filter(task_count != 0) |>
-  filter(relative_week >= (- 12))
+  filter(relative_week >= (- 12)) |>
+  filter(status != "ABANDONED")
 time_plot <- delta_df |>
   ggplot(aes(x=relative_week, y=avg_resolution_time, color=parent_projects)) +
   labs(x="Weekly Offset", y="Avg. (weekly) Time from task creation to last update (days)", color = "Project") +
@@ -49,11 +51,12 @@ time_plot
 
 
 loc_df <- input_df |>
+  filter(relative_week >= (- 19) & relative_week <= (19)) |>
   filter(task_count != 0) |>
   filter(status != "ABANDONED")
 time_plot <- loc_df |>
-  ggplot(aes(x=relative_week, y=avg_deletions, color=parent_projects)) +
-  labs(x="Weekly Offset", y="Avg. LOC Deleted per Accepted Gerrit Task", color = "Project") +
+  ggplot(aes(x=relative_week, y=avg_insertions, color=parent_projects)) +
+  labs(x="Weekly Offset", y="Avg. LOC Inserted per Accepted Gerrit Task", color = "Project") +
   geom_line() +
   geom_vline(xintercept = 0)+
   theme_bw() +
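Note: since both avg_insertions and avg_deletions appear in loc_df (the old and new lines above), one pivot_longer would put both LOC series on a single panel. A sketch (not part of the commit):

loc_long <- loc_df |>
  tidyr::pivot_longer(c(avg_insertions, avg_deletions),
                      names_to = "loc_type", values_to = "avg_loc")
ggplot(loc_long, aes(x = relative_week, y = avg_loc,
                     color = parent_projects, linetype = loc_type)) +
  geom_line() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")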
18 mgaughan-rstudio-server_24372601.out Normal file
@@ -0,0 +1,18 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3441:34585 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: WoborOUQ79MgRq898+pw
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+   scancel -f 24372601
+slurmstepd: error: *** JOB 24372601 ON n3441 CANCELLED AT 2025-02-20T15:43:50 ***
@@ -1,15 +1,48 @@
 library(dplyr)
 library(ggplot2)
-phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/0205_convo_data/phab_data/visualeditor/0205_ve_phab_comments.csv"
+phab_data_path <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0217_ve_phab_comments.csv"
 phab_data <- read.csv(phab_data_path, header=TRUE)
 
 phab_data <- phab_data |>
-  mutate(has_ref = grepl("bots", comment_text)) |>
+  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", comment_text)) |>
+  mutate(has_bot_ref = grepl("bots|scripts|gadgets", comment_text)) |>
   mutate(timestamp = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
-  filter(date_created < 1518232866 & date_created > 1356923678)
+  mutate(comment_id = row_number())|>
+  filter(date_created < 1383264000 & date_created > 1351728000)
+#looking at all data between 11-1-2012 and 11-1-2013
+
+length(unique(phab_data$date_created))
+
+#g <- ggplot(phab_data, aes(x=timestamp, y=has_bot_ref)) +
+#  geom_point(alpha = 0.5) +
+#  theme_minimal()
+#g
+
+library(udpipe)
+#library(rsyntax) https://github.com/vanatteveldt/rsyntax?tab=readme-ov-file
+
+library(tidytext)
+library(dplyr)
+library(stringr)
+
+# we first need to transform our comment level of analysis into sentences
+sentence_level_data <- phab_data |>
+  unnest_tokens(sentence, comment_text, token = "sentences") |>
+  group_by(comment_id) |>
+  mutate(sentence_id = row_number())|>
+  dplyr::select(-has_bot_ref, -has_ref)|>
+  mutate(has_ref = grepl("visualeditor|VE|ve|VisualEditor", sentence)) |>
+  mutate(has_bot_ref = grepl("bots|scripts|gadgets", sentence)) |>
+  ungroup()
 
 
-g <- ggplot(phab_data, aes(x=timestamp, y=has_ref)) +
-  geom_point(alpha = 0.5) +
-  theme_minimal()
-g
+library(udpipe)
+library(rsyntax)
+# Load necessary libraries
+library(spacyr)
+spacy_install()
+#we only care about stuff that mentions VE rn, then tokenize
+sentence_level_data <- sentence_level_data |>
+  filter(has_ref == TRUE) |>
+  mutate(sentence_tokens = udpipe(sentence, "english"))
+