initial gerrit analysis for ve
This commit is contained in:
parent
f08f21a67e
commit
7f8b885ef4
@ -59,24 +59,25 @@ count_mean <- mean(combined_df$count) #628.119
|
||||
count_var_to_mean <- count_var / count_mean #2262140.471
|
||||
|
||||
# the mean count values for each day
|
||||
summary_df <- combined_df |>
|
||||
wiki_summary_df <- combined_df |>
|
||||
filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |>
|
||||
filter(wiki_db == "enwiki") |>
|
||||
group_by(date) |>
|
||||
summarize(
|
||||
mean_count = mean(count),
|
||||
sum_count = sum(count),
|
||||
median_count = median(count)
|
||||
)
|
||||
#plotting it
|
||||
p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
|
||||
geom_line(color = "blue") + # Line plot
|
||||
geom_point(color = "red") + # Points on the line
|
||||
p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) +
|
||||
geom_line(color = "blue") + # Line plot # Points on the line
|
||||
geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") +
|
||||
labs(title = "Median Bot Actions",
|
||||
x = "Date",
|
||||
y = "Median Count") +
|
||||
labs(title = "enwiki Total Bot Actions",
|
||||
x = "Date (daily)",
|
||||
y = "Action Count") +
|
||||
theme_minimal()
|
||||
p1
|
||||
p1_5 <- ggplot(summary_df, aes(x = date)) +
|
||||
geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
|
||||
p1_5 <- ggplot(enwiki_summary_df, aes(x = date)) +
|
||||
geom_smooth(aes(y = sum_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
|
||||
#geom_point(color = "red") + # Points on the line
|
||||
labs(title = "Median Bot Actions",
|
||||
x = "Date",
|
||||
|
@ -36,9 +36,9 @@ transform_commit_data <- function(filepath){
|
||||
|
||||
#we are looking at weekly data, 6m before and 6m after
|
||||
#start_date <- event_date %m-% months(6)
|
||||
calculated_start_date <- event_date %m-% months(24)
|
||||
calculated_start_date <- event_date %m-% months(12)
|
||||
start_date <- max(calculated_start_date, oldest_commit_date)
|
||||
end_date <- event_date %m+% months(24)
|
||||
end_date <- event_date %m+% months(12)
|
||||
|
||||
#getting the relative weeks to the publication date
|
||||
relative_week <- function(date, ref_date) {
|
||||
|
65
gerrit_analysis/make_gerrit_count_data.R
Normal file
65
gerrit_analysis/make_gerrit_count_data.R
Normal file
@ -0,0 +1,65 @@
|
||||
library(tidyverse)
|
||||
|
||||
# Path to the input Gerrit CSV.
gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"

# NOTE(review): input_df is not referenced again in this script;
# transform_gerrit_data() re-reads the file from its own data_fp argument.
input_df <- read.csv(file = gerrit_fp, header = TRUE)
|
||||
|
||||
# Build a weekly, zero-filled count table of Gerrit changes around an event.
#
# Arguments:
#   data_fp        Path to a CSV with (at least) the columns project, status,
#                  created, updated, submitted, insertions and deletions.
#   projects       Character vector of project names to keep.
#   event_date     Date that relative weeks are measured from
#                  (week 0 starts on this date). Defaults to 2013-07-01.
#   window_months  Number of months kept before and after event_date.
#                  Defaults to 24.
#
# Returns a data frame with one row per project x relative_week x status
# holding task_count, avg_resolution_time (days between created and updated),
# avg_insertions and avg_deletions. Combinations without any change get
# task_count = 0 and avg_resolution_time = 0; their avg_insertions and
# avg_deletions stay NA.
transform_gerrit_data <- function(data_fp, projects,
                                  event_date = as.Date("2013-07-01"),
                                  window_months = 24) {
  # Load, keep the requested projects and parse the timestamp columns.
  df <- read.csv(data_fp, header = TRUE) |>
    filter(project %in% projects) |>
    mutate(
      created = ymd_hms(created),
      updated = ymd_hms(updated),
      submitted = ymd_hms(submitted)
    )

  # Fail with a clear message instead of an obscure seq() error further down.
  if (nrow(df) == 0 || all(is.na(df$created))) {
    stop(
      "no rows with a parseable `created` timestamp for the requested projects",
      call. = FALSE
    )
  }

  # The window never starts before the first observed change. na.rm = TRUE
  # keeps a single unparseable timestamp from turning the whole window into NA.
  oldest_created_date <- min(as.Date(df$created), na.rm = TRUE)
  calculated_start_date <- event_date %m-% months(window_months)
  start_date <- max(calculated_start_date, oldest_created_date)
  end_date <- event_date %m+% months(window_months)

  # Whole weeks between `date` and `ref_date` (floor division, so dates before
  # the reference map to negative weeks).
  relative_week <- function(date, ref_date) {
    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
  }

  # Weeks are assigned from the created date.
  df <- df |>
    mutate(
      relative_week = relative_week(created, event_date),
      create_update_delta = as.numeric(
        difftime(updated, created, units = "days")
      )
    )

  # Every project x week x status combination inside the window, used to add
  # filler zeros. stringsAsFactors = FALSE keeps the join keys character, the
  # same type as the columns read from the CSV.
  complete_weeks_df <- expand.grid(
    project = unique(df$project),
    relative_week = seq(
      relative_week(start_date, event_date),
      relative_week(end_date, event_date)
    ),
    status = unique(df$status),
    stringsAsFactors = FALSE
  )

  # The right join keeps exactly the combinations of the grid, so changes
  # created outside the window are dropped here.
  df |>
    group_by(project, relative_week, status) |>
    summarise(
      task_count = n(),
      avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
      avg_insertions = mean(insertions, na.rm = TRUE),
      avg_deletions = mean(deletions, na.rm = TRUE),
      .groups = "drop"
    ) |>
    right_join(
      complete_weeks_df,
      by = c("project", "relative_week", "status")
    ) |>
    replace_na(list(task_count = 0, avg_resolution_time = 0))
}
|
||||
|
||||
# Build the weekly count table for the selected projects and write it to disk.
transformed_data <- transform_gerrit_data(
  data_fp = gerrit_fp,
  projects = c(
    "VisualEditor/VisualEditor",
    "mediawiki/extensions",
    "mediawiki/core",
    "mediawiki/extensions/VisualEditor"
  )
)

output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"

write.csv(transformed_data, file = output_filepath, row.names = FALSE)
|
61
gerrit_analysis/plotting_gerrit.R
Normal file
61
gerrit_analysis/plotting_gerrit.R
Normal file
@ -0,0 +1,61 @@
|
||||
library(tidyverse)
|
||||
# Weekly Gerrit count table to plot.
count_data_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"

# Half-width of the plotting window, in weeks either side of week 0.
# window_num <- 19
window_num <- 52

# Keep the weeks inside the window and fold the VisualEditor extension into
# its parent project so both share one legend entry.
input_df <- read.csv(file = count_data_fp, header = TRUE) |>
  filter(abs(relative_week) <= window_num) |>
  mutate(
    parent_projects = if_else(
      project == "mediawiki/extensions/VisualEditor",
      "mediawiki/extensions",
      project
    )
  )
|
||||
|
||||
library(scales)
|
||||
library(ggplot2)
|
||||
|
||||
# Smoothed weekly number of newly created Gerrit tasks per parent project.
# input_df holds one row per project x status x week (including zero filler
# rows), so the counts are first summed to a single weekly total per parent
# project; otherwise the smoother is fit to per-status cells rather than to
# the weekly totals the axis label describes.
time_plot <- input_df |>
  group_by(parent_projects, relative_week) |>
  summarise(task_count = sum(task_count), .groups = "drop") |>
  ggplot(aes(x = relative_week, y = task_count, color = parent_projects)) +
  labs(x = "Weekly Offset", y = "New Gerrit Tasks Created", color = "Project") +
  geom_smooth() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")
time_plot
|
||||
|
||||
|
||||
# Weekly number of abandoned Gerrit tasks per parent project.
abandoned_df <- input_df |>
  filter(status == "ABANDONED")

# Several projects can share one parent_projects value, which would give
# geom_line() more than one point per week in the same colour group and draw
# a sawtooth. Sum to one value per parent project and week before plotting.
time_plot <- abandoned_df |>
  group_by(parent_projects, relative_week) |>
  summarise(task_count = sum(task_count), .groups = "drop") |>
  ggplot(aes(x = relative_week, y = task_count, color = parent_projects)) +
  labs(
    x = "Weekly Offset",
    y = "Abandoned Gerrit Tasks Created",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")
time_plot
|
||||
|
||||
|
||||
# Average time between task creation and last update, from 12 weeks before
# week 0 onwards. Zero-count filler rows are dropped because their
# avg_resolution_time is a placeholder, not a measurement.
delta_df <- input_df |>
  filter(task_count != 0) |>
  filter(relative_week >= (-12))

# delta_df still has one row per project x status x week, which would give
# geom_line() several points per week in one colour group. Collapse them to a
# single value per parent project and week; weighting by task_count recovers
# the mean over the underlying tasks instead of a mean of means.
time_plot <- delta_df |>
  group_by(parent_projects, relative_week) |>
  summarise(
    avg_resolution_time = weighted.mean(
      avg_resolution_time, task_count,
      na.rm = TRUE
    ),
    .groups = "drop"
  ) |>
  ggplot(
    aes(x = relative_week, y = avg_resolution_time, color = parent_projects)
  ) +
  labs(
    x = "Weekly Offset",
    y = "Avg. (weekly) Time from task creation to last update (days)",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")
time_plot
|
||||
|
||||
|
||||
# Average lines of code deleted per non-abandoned Gerrit task.
# NOTE(review): the axis label says "Accepted", but the filter keeps every
# status other than ABANDONED - confirm that no open/unmerged status remains.
loc_df <- input_df |>
  filter(task_count != 0) |>
  filter(status != "ABANDONED")

# loc_df can hold several rows per parent project and week (one per project
# and remaining status), which would make geom_line() zigzag. Collapse them
# with a task_count-weighted mean. This is approximate when some tasks had a
# missing deletions value, because the per-row means were computed with
# na.rm = TRUE.
time_plot <- loc_df |>
  group_by(parent_projects, relative_week) |>
  summarise(
    avg_deletions = weighted.mean(avg_deletions, task_count, na.rm = TRUE),
    .groups = "drop"
  ) |>
  ggplot(aes(x = relative_week, y = avg_deletions, color = parent_projects)) +
  labs(
    x = "Weekly Offset",
    y = "Avg. LOC Deleted per Accepted Gerrit Task",
    color = "Project"
  ) +
  geom_line() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")
time_plot
|
@ -1,18 +0,0 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:56403 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: [REDACTED - plaintext credential removed; rotate it if still valid]
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 24263466
|
||||
slurmstepd: error: *** JOB 24263466 ON n3439 CANCELLED AT 2025-02-15T16:52:54 ***
|
Loading…
Reference in New Issue
Block a user