initial gerrit analysis for ve
This commit is contained in:
		
							parent
							
								
									f08f21a67e
								
							
						
					
					
						commit
						7f8b885ef4
					
				| @ -59,24 +59,25 @@ count_mean <- mean(combined_df$count) #628.119 | ||||
| count_var_to_mean <- count_var / count_mean #2262140.471 | ||||
| 
 | ||||
| # the mean count values for each day | ||||
| summary_df <- combined_df |> | ||||
| wiki_summary_df <- combined_df |> | ||||
|   filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |> | ||||
|   filter(wiki_db == "enwiki") |> | ||||
|   group_by(date) |> | ||||
|   summarize( | ||||
|     mean_count = mean(count), | ||||
|     sum_count = sum(count), | ||||
|     median_count = median(count) | ||||
|   ) | ||||
| #plotting it | ||||
| p1 <- ggplot(summary_df, aes(x = date, y = median_count)) + | ||||
|   geom_line(color = "blue") +     # Line plot | ||||
|   geom_point(color = "red") +     # Points on the line | ||||
| p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) + | ||||
|   geom_line(color = "blue") +     # Line plot    # Points on the line | ||||
|   geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + | ||||
|   labs(title = "Median Bot Actions", | ||||
|        x = "Date", | ||||
|        y = "Median Count") + | ||||
|   labs(title = "enwiki Total Bot Actions", | ||||
|        x = "Date (daily)", | ||||
|        y = "Action Count") + | ||||
|   theme_minimal()     | ||||
| p1 | ||||
| p1_5 <- ggplot(summary_df, aes(x = date)) + | ||||
|   geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) + | ||||
| p1_5 <- ggplot(enwiki_summary_df, aes(x = date)) + | ||||
|   geom_smooth(aes(y = sum_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) + | ||||
|   #geom_point(color = "red") +     # Points on the line | ||||
|   labs(title = "Median Bot Actions", | ||||
|        x = "Date", | ||||
|  | ||||
| @ -36,9 +36,9 @@ transform_commit_data <- function(filepath){ | ||||
|    | ||||
|   #we are looking at weekly data, 12m before and 12m after (capped at the oldest commit) | ||||
|   #start_date <- event_date %m-% months(6) | ||||
|   calculated_start_date <- event_date %m-% months(24) | ||||
|   calculated_start_date <- event_date %m-% months(12) | ||||
|   start_date <- max(calculated_start_date, oldest_commit_date) | ||||
|   end_date <- event_date %m+% months(24) | ||||
|   end_date <- event_date %m+% months(12) | ||||
|    | ||||
|   #getting the relative weeks to the publication date | ||||
|   relative_week <- function(date, ref_date) { | ||||
|  | ||||
							
								
								
									
										65
									
								
								gerrit_analysis/make_gerrit_count_data.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								gerrit_analysis/make_gerrit_count_data.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,65 @@ | ||||
| library(tidyverse) | ||||
| 
 | ||||
# Path to the raw Gerrit change export. The CSV is read inside
# transform_gerrit_data(), so it is not loaded a second time here.
gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"
| 
 | ||||
#' Build weekly Gerrit change counts relative to an event date
#'
#' Reads a Gerrit export, keeps the requested projects, buckets every change
#' into a week offset from `event_date` (based on its `created` timestamp) and
#' summarises each project/week/status combination. Combinations inside the
#' analysis window that have no changes are added as zero-count filler rows;
#' changes outside the window are dropped by the join.
#'
#' @param data_fp Path to a CSV with at least the columns `project`, `status`,
#'   `created`, `updated`, `submitted`, `insertions` and `deletions`.
#' @param projects Character vector of Gerrit project names to keep.
#' @param event_date Reference date; week 0 starts here.
#' @param window_months Number of months before and after `event_date` to
#'   cover. The start is capped at the oldest `created` date in the data.
#' @return A data frame with one row per project, relative week and status,
#'   holding `task_count`, `avg_resolution_time` (days between `created` and
#'   `updated`), `avg_insertions` and `avg_deletions`. Filler rows have
#'   `task_count` and `avg_resolution_time` set to 0 and the other averages NA.
transform_gerrit_data <- function(data_fp, projects,
                                  event_date = as.Date("2013-07-01"),
                                  window_months = 24) {
  # loading in and some initial pre-processing
  df <- read.csv(data_fp, header = TRUE) |>
    filter(project %in% projects) |>
    mutate(
      created = ymd_hms(created),
      updated = ymd_hms(updated),
      submitted = ymd_hms(submitted)
    )

  # Fail early with a readable message instead of an obscure seq() error.
  if (nrow(df) == 0 || all(is.na(df$created))) {
    stop(
      "no usable rows in ", data_fp, " for the requested projects",
      call. = FALSE
    )
  }

  # na.rm: a single unparseable timestamp must not turn the window start
  # into NA.
  oldest_created_date <- min(as.Date(df$created), na.rm = TRUE)
  calculated_start_date <- event_date %m-% months(window_months)
  start_date <- max(calculated_start_date, oldest_created_date)
  end_date <- event_date %m+% months(window_months)

  # Whole weeks between `date` and `ref_date`; %/% floors, so dates before
  # the reference get negative offsets.
  weeks_since <- function(date, ref_date) {
    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
  }

  # going off of the created date
  df <- df |>
    mutate(
      relative_week = weeks_since(created, event_date),
      create_update_delta = as.numeric(
        difftime(updated, created, units = "days")
      )
    )

  # creating filler zeros: every project x week x status inside the window.
  # stringsAsFactors = FALSE keeps the join keys character, matching `df`.
  complete_weeks_df <- expand.grid(
    project = unique(df$project),
    relative_week = seq(
      weeks_since(start_date, event_date),
      weeks_since(end_date, event_date)
    ),
    status = unique(df$status),
    stringsAsFactors = FALSE
  )

  df |>
    group_by(project, relative_week, status) |>
    summarise(
      task_count = n(),
      avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
      avg_insertions = mean(insertions, na.rm = TRUE),
      avg_deletions = mean(deletions, na.rm = TRUE),
      .groups = "drop"
    ) |>
    right_join(
      complete_weeks_df,
      by = c("project", "relative_week", "status")
    ) |>
    replace_na(list(task_count = 0, avg_resolution_time = 0))
}
| 
 | ||||
# Gerrit projects included in the VisualEditor case study.
ve_projects <- c(
  "VisualEditor/VisualEditor",
  "mediawiki/extensions",
  "mediawiki/core",
  "mediawiki/extensions/VisualEditor"
)

# Weekly per-project, per-status counts around the event date.
transformed_data <- transform_gerrit_data(gerrit_fp, ve_projects)

# Persist the weekly table for the plotting script.
output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
write.csv(transformed_data, file = output_filepath, row.names = FALSE)
							
								
								
									
										61
									
								
								gerrit_analysis/plotting_gerrit.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								gerrit_analysis/plotting_gerrit.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,61 @@ | ||||
library(tidyverse)
library(scales)
library(ggplot2)

# Weekly Gerrit counts written by make_gerrit_count_data.R: one row per
# project, relative week and status.
count_data_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
input_df <- read.csv(count_data_fp, header = TRUE)

# Keep a symmetric window of weeks around the event (week 0) and fold the
# VisualEditor extension into its parent project for colouring.
# window_num <- 19
window_num <- 52
input_df <- input_df |>
  filter(relative_week >= -window_num & relative_week <= window_num) |>
  mutate(parent_projects = if_else(
    project == "mediawiki/extensions/VisualEditor",
    "mediawiki/extensions",
    project
  ))

# Collapse to one row per parent project and week. The input has one row per
# project *and* status, and two projects share the "mediawiki/extensions"
# label, so without this every plotted series has several y values per week
# (sawtooth lines, per-status fragments instead of weekly totals).
# Averages are weighted by task_count so the result is a per-task average;
# task_count is summed last because summarise() would otherwise hand the
# already-collapsed value to weighted.mean() as the weights.
collapse_weekly <- function(df) {
  df |>
    group_by(parent_projects, relative_week) |>
    summarise(
      avg_resolution_time = weighted.mean(
        avg_resolution_time, task_count,
        na.rm = TRUE
      ),
      avg_deletions = weighted.mean(avg_deletions, task_count, na.rm = TRUE),
      task_count = sum(task_count),
      .groups = "drop"
    )
}

# Shared recipe for all figures: column `y_col` against the weekly offset,
# coloured by parent project, drawn with `layer`, with a vertical marker at
# the event week.
weekly_plot <- function(df, y_col, y_label, layer) {
  ggplot(
    df,
    aes(x = relative_week, y = .data[[y_col]], color = parent_projects)
  ) +
    labs(x = "Weekly Offset", y = y_label, color = "Project") +
    layer +
    geom_vline(xintercept = 0) +
    theme_bw() +
    theme(legend.position = "top")
}

# Smoothed weekly totals of newly created Gerrit tasks (all statuses).
time_plot <- input_df |>
  collapse_weekly() |>
  weekly_plot("task_count", "New Gerrit Tasks Created", geom_smooth())
time_plot

# Weekly totals of tasks whose status is ABANDONED.
abandoned_df <- input_df |>
  filter(status == "ABANDONED") |>
  collapse_weekly()
time_plot <- abandoned_df |>
  weekly_plot("task_count", "Abandoned Gerrit Tasks Created", geom_line())
time_plot

# Average days from creation to last update, from 12 weeks before the event
# onward. Zero-count filler rows are dropped first because their average is
# a placeholder 0, not a measurement.
delta_df <- input_df |>
  filter(task_count != 0) |>
  filter(relative_week >= -12) |>
  collapse_weekly()
time_plot <- delta_df |>
  weekly_plot(
    "avg_resolution_time",
    "Avg. (weekly) Time from task creation to last update (days)",
    geom_line()
  )
time_plot

# Average deleted lines per task, excluding abandoned tasks and filler rows.
loc_df <- input_df |>
  filter(task_count != 0) |>
  filter(status != "ABANDONED") |>
  collapse_weekly()
time_plot <- loc_df |>
  weekly_plot(
    "avg_deletions",
    "Avg. LOC Deleted per Accepted Gerrit Task",
    geom_line()
  )
time_plot
| @ -1,18 +0,0 @@ | ||||
| 1. SSH tunnel from your workstation using the following command: | ||||
| 
 | ||||
|    ssh -N -L 8787:n3439:56403 mjilg@klone.hyak.uw.edu | ||||
| 
 | ||||
|    and point your web browser to http://localhost:8787 | ||||
| 
 | ||||
| 2. log in to RStudio Server using the following credentials: | ||||
| 
 | ||||
|    user: mjilg | ||||
|    password: 8aREIl1dg0jceqBTNl9a | ||||
| 
 | ||||
| When done using RStudio Server, terminate the job by: | ||||
| 
 | ||||
| 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) | ||||
| 2. Issue the following command on the login node: | ||||
| 
 | ||||
|       scancel -f 24263466 | ||||
| slurmstepd: error: *** JOB 24263466 ON n3439 CANCELLED AT 2025-02-15T16:52:54 *** | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user