1
0

initial gerrit analysis for ve

This commit is contained in:
Matthew Gaughan 2025-02-16 14:08:16 -08:00
parent f08f21a67e
commit 7f8b885ef4
5 changed files with 139 additions and 30 deletions

View File

@ -59,24 +59,25 @@ count_mean <- mean(combined_df$count) #628.119
count_var_to_mean <- count_var / count_mean #2262140.471
# the mean count values for each day
summary_df <- combined_df |>
wiki_summary_df <- combined_df |>
filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |>
filter(wiki_db == "enwiki") |>
group_by(date) |>
summarize(
mean_count = mean(count),
sum_count = sum(count),
median_count = median(count)
)
#plotting it
p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
geom_line(color = "blue") + # Line plot
geom_point(color = "red") + # Points on the line
p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) +
geom_line(color = "blue") + # Line plot # Points on the line
geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") +
labs(title = "Median Bot Actions",
x = "Date",
y = "Median Count") +
labs(title = "enwiki Total Bot Actions",
x = "Date (daily)",
y = "Action Count") +
theme_minimal()
p1
p1_5 <- ggplot(summary_df, aes(x = date)) +
geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
p1_5 <- ggplot(enwiki_summary_df, aes(x = date)) +
geom_smooth(aes(y = sum_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
#geom_point(color = "red") + # Points on the line
labs(title = "Median Bot Actions",
x = "Date",

View File

@ -36,9 +36,9 @@ transform_commit_data <- function(filepath){
#we are looking at weekly data, 6m before and 6m after
#start_date <- event_date %m-% months(6)
calculated_start_date <- event_date %m-% months(24)
calculated_start_date <- event_date %m-% months(12)
start_date <- max(calculated_start_date, oldest_commit_date)
end_date <- event_date %m+% months(24)
end_date <- event_date %m+% months(12)
#getting the relative weeks to the publication date
relative_week <- function(date, ref_date) {

View File

@ -0,0 +1,65 @@
library(tidyverse)
# Location of the raw Gerrit export (CSV) used by this script.
gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"
# Load the export into the session. transform_gerrit_data() reads the file
# again from its own path argument, so nothing below depends on input_df.
input_df <- gerrit_fp |>
  read.csv(header = TRUE)
#' Build a weekly, zero-filled summary of Gerrit changes around an event date.
#'
#' Reads a Gerrit export, keeps the requested projects, assigns every change a
#' week offset relative to `event_date` (based on its `created` timestamp) and
#' aggregates per project / week / status. Every project / week / status
#' combination inside the observation window that has no changes is added as
#' a filler row.
#'
#' @param data_fp Path to a CSV with at least the columns `project`, `status`,
#'   `created`, `updated`, `submitted`, `insertions` and `deletions`.
#' @param projects Character vector of Gerrit project names to keep.
#' @param event_date Reference date that defines week 0.
#' @param window_months Number of months kept before and after `event_date`.
#'   The start of the window is clipped to the oldest `created` date found.
#' @return A data frame with one row per project, relative week and status,
#'   holding `task_count`, `avg_resolution_time` (days between `created` and
#'   `updated`), `avg_insertions` and `avg_deletions`. Filler rows have
#'   `task_count` and `avg_resolution_time` set to 0 and the other averages
#'   left as `NA`.
transform_gerrit_data <- function(data_fp, projects,
                                  event_date = as.Date("2013-07-01"),
                                  window_months = 24) {
  # NOTE: ymd_hms() and %m-% / %m+% are lubridate functions; they are only in
  # scope when lubridate is attached (library(tidyverse) does that from
  # tidyverse 2.0.0 onwards).
  df <- read.csv(data_fp, header = TRUE) |>
    filter(project %in% projects) |>
    mutate(
      created = ymd_hms(created),
      updated = ymd_hms(updated),
      submitted = ymd_hms(submitted)
    )

  # Fail with a readable message instead of an obscure seq() error further
  # down when the filter leaves nothing usable.
  if (all(is.na(df$created))) {
    stop(
      "No rows with a parseable `created` timestamp for the requested projects.",
      call. = FALSE
    )
  }

  # na.rm: a single unparseable timestamp must not turn the whole window
  # into NA.
  oldest_created_date <- min(as.Date(df$created), na.rm = TRUE)
  calculated_start_date <- event_date %m-% months(window_months)
  start_date <- max(calculated_start_date, oldest_created_date)
  end_date <- event_date %m+% months(window_months)

  # Whole weeks between `date` and `ref_date`; floor division, so dates
  # before the reference get negative offsets.
  relative_week <- function(date, ref_date) {
    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
  }

  # Week offsets are based on the created date.
  df <- df |>
    mutate(
      relative_week = relative_week(created, event_date),
      create_update_delta = as.numeric(
        difftime(updated, created, units = "days")
      )
    )

  # Full project x week x status grid used to create the filler rows.
  # stringsAsFactors = FALSE keeps the join keys as plain character columns
  # (expand.grid() would otherwise convert them to factors).
  all_weeks <- seq(
    relative_week(start_date, event_date),
    relative_week(end_date, event_date)
  )
  complete_weeks_df <- expand.grid(
    project = unique(df$project),
    relative_week = all_weeks,
    status = unique(df$status),
    stringsAsFactors = FALSE
  )

  # right_join keeps exactly the grid rows, which also drops any change that
  # falls outside the window (or has no parseable created date).
  df |>
    group_by(project, relative_week, status) |>
    summarise(
      task_count = n(),
      avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
      avg_insertions = mean(insertions, na.rm = TRUE),
      avg_deletions = mean(deletions, na.rm = TRUE),
      .groups = "drop"
    ) |>
    right_join(
      complete_weeks_df,
      by = c("project", "relative_week", "status")
    ) |>
    replace_na(list(task_count = 0)) |>
    replace_na(list(avg_resolution_time = 0))
}
# Destination for the weekly per-project / per-status counts.
output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
# Run the transformation for the four projects of interest.
transformed_data <- gerrit_fp |>
  transform_gerrit_data(
    projects = c(
      "VisualEditor/VisualEditor",
      "mediawiki/extensions",
      "mediawiki/core",
      "mediawiki/extensions/VisualEditor"
    )
  )
# Persist the result without row names.
write.csv(x = transformed_data, file = output_filepath, row.names = FALSE)

View File

@ -0,0 +1,61 @@
library(tidyverse)
# Weekly Gerrit counts, one row per project / relative week / status.
count_data_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
# Half-width of the plotting window, in weeks on either side of week 0.
# window_num <- 19
window_num <- 52
# Load, clip to the window, and fold the VisualEditor extension into its
# parent "mediawiki/extensions" label for colouring.
input_df <- read.csv(count_data_fp, header = TRUE) |>
  filter(relative_week >= -window_num, relative_week <= window_num) |>
  mutate(
    parent_projects = if_else(
      project == "mediawiki/extensions/VisualEditor",
      "mediawiki/extensions",
      project
    )
  )
library(scales)
library(ggplot2)
# Smoothed weekly task counts, coloured by parent project; the vertical line
# marks week 0.
# NOTE(review): input_df appears to hold one row per project / week / status,
# so the smoother is fitted to per-status counts (including zero filler rows)
# rather than weekly totals -- confirm that is intended.
time_plot <- ggplot(
  input_df,
  aes(x = relative_week, y = task_count, color = parent_projects)
) +
  geom_smooth() +
  geom_vline(xintercept = 0) +
  labs(
    x = "Weekly Offset",
    y = "New Gerrit Tasks Created",
    color = "Project"
  ) +
  theme_bw() +
  theme(legend.position = "top")
time_plot
# Weekly number of abandoned Gerrit tasks per parent project.
# Several projects can share one parent_projects label, which would give
# geom_line() more than one point per week for the same colour and draw a
# sawtooth. Sum to a single value per parent project and week first.
abandoned_df <- input_df |>
  filter(status == "ABANDONED") |>
  group_by(parent_projects, relative_week) |>
  summarise(task_count = sum(task_count), .groups = "drop")
time_plot <- abandoned_df |>
  ggplot(aes(x = relative_week, y = task_count, color = parent_projects)) +
  labs(
    x = "Weekly Offset",
    y = "Abandoned Gerrit Tasks Created",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) + # week 0 = event date
  theme_bw() +
  theme(legend.position = "top")
time_plot
# Average time (days) between task creation and last update, per parent
# project and week, from 12 weeks before the event onwards.
# Rows with task_count == 0 are filler rows and carry no real average, so
# they are dropped. The remaining rows still hold one value per project and
# status; they are combined into a single count-weighted mean per parent
# project and week so that geom_line() gets exactly one point per week and
# colour instead of zig-zagging between statuses.
delta_df <- input_df |>
  filter(task_count != 0) |>
  filter(relative_week >= (-12)) |>
  group_by(parent_projects, relative_week) |>
  summarise(
    avg_resolution_time = weighted.mean(
      avg_resolution_time,
      w = task_count,
      na.rm = TRUE
    ),
    .groups = "drop"
  )
time_plot <- delta_df |>
  ggplot(
    aes(x = relative_week, y = avg_resolution_time, color = parent_projects)
  ) +
  labs(
    x = "Weekly Offset",
    y = "Avg. (weekly) Time from task creation to last update (days)",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) + # week 0 = event date
  theme_bw() +
  theme(legend.position = "top")
time_plot
# Average lines of code deleted per non-abandoned Gerrit task, per parent
# project and week.
# Filler rows (task_count == 0) are dropped. The remaining per-project /
# per-status averages are merged into one count-weighted mean per parent
# project and week so that geom_line() draws a single point per week and
# colour.
# NOTE(review): the filter keeps every status except "ABANDONED", while the
# axis label says "Accepted" -- confirm which statuses should be included.
loc_df <- input_df |>
  filter(task_count != 0) |>
  filter(status != "ABANDONED") |>
  group_by(parent_projects, relative_week) |>
  summarise(
    avg_deletions = weighted.mean(avg_deletions, w = task_count, na.rm = TRUE),
    .groups = "drop"
  )
time_plot <- loc_df |>
  ggplot(aes(x = relative_week, y = avg_deletions, color = parent_projects)) +
  labs(
    x = "Weekly Offset",
    y = "Avg. LOC Deleted per Accepted Gerrit Task",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) + # week 0 = event date
  theme_bw() +
  theme(legend.position = "top")
time_plot

View File

@ -1,18 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:56403 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: 8aREIl1dg0jceqBTNl9a
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 24263466
slurmstepd: error: *** JOB 24263466 ON n3439 CANCELLED AT 2025-02-15T16:52:54 ***