diff --git a/analysis_data/scratch.R b/analysis_data/scratch.R index 3263733..218ba54 100644 --- a/analysis_data/scratch.R +++ b/analysis_data/scratch.R @@ -2,24 +2,76 @@ library(tidyverse) main_csv <-"~/analysis_data/120725_unified.csv" main_df <- read.csv(main_csv, header = TRUE) -author_closer <- main_df |> - filter(comment_type == "task_description") |> - filter(author_closer == "True") -table(author_closer$isAuthorWMF) +dsl_csv <-"~/dsl/120725_DSL_frame.csv" +dsl_df <- read.csv(dsl_csv, header = TRUE) + +needs_triage <- dsl_df |> + filter(week_index >= -4) |> + filter(priority=="Needs Triage") |> + mutate(ttr_weeks = TTR/168) |> + group_by(source, isAuthorWMF)|> + summarize( + mean_ttr_weeks = mean(ttr_weeks), + sd_ttr_weeks = sd(ttr_weeks) + ) + +changes<- dsl_df |> + filter(priority == "Needs Triage" | + priority == "Unbreak Now!" | + priority == "High") |> + mutate( + period = case_when( + week_index >= -4 & week_index <= 4 ~ "8 weeks after announcement", + week_index >= -13 & week_index <= -5 ~ "8 weeks before deployment announcement", + TRUE ~ NA + ) + ) %>% + filter(!is.na(period)) |> + mutate(ttr_weeks = TTR/168) |> + group_by(source, period, priority) %>% + summarise( + count = n(), + mean_ttr_weeks = mean(ttr_weeks, na.rm = TRUE), + sd_ttr_weeks = sd(ttr_weeks, na.rm = TRUE), + ) -new_authors_summary <- main_df |> +# new contributors +first_task <- main_df |> filter(comment_type == "task_description") |> group_by(source, AuthorPHID) |> summarise( task_count = n(), - first_task = min(week_index) - ) |> - group_by(first_task, source) |> - summarise( - new_authors_count = n() - ) -ggplot(new_authors_summary, aes(x = first_task, y = new_authors_count)) + + first_task_week = min(week_index) + ) + +tasks_flagged <- main_df %>% + filter(comment_type == "task_description") |> + left_join(first_task, by = c("source", "AuthorPHID")) %>% + mutate(is_first_time_author = week_index == first_task_week) + +summary_df <- tasks_flagged %>% + group_by(week_index, source) %>% + summarize( + total_tasks = n(), + first_time_tasks = sum(is_first_time_author), + proportion_first_time = first_time_tasks / total_tasks + ) %>% + ungroup() + +period_counts <- summary_df %>% + mutate( + period = case_when( + week_index >= -4 & week_index <= 4 ~ "recent", + week_index >= -13 & week_index <= -5 ~ "prior", + TRUE ~ NA_character_ + ) + ) %>% + filter(!is.na(period)) %>% + group_by(source, period) %>% + summarize(period_first_time_tasks = sum(first_time_tasks), .groups = "drop") + +ggplot(summary_df, aes(x = week_index, y = first_time_tasks)) + facet_grid(source ~ ., scales = "free_y", labeller = labeller(source = c("c1" = "VisualEditor", diff --git a/doc_plots/120825_dsl_coefs.png b/doc_plots/120825_dsl_coefs.png new file mode 100644 index 0000000..17e9c2c Binary files /dev/null and b/doc_plots/120825_dsl_coefs.png differ diff --git a/doc_plots/rq1_plots/120825_c1_ttr.png b/doc_plots/rq1_plots/120825_c1_ttr.png index 3971b7b..f531227 100644 Binary files a/doc_plots/rq1_plots/120825_c1_ttr.png and b/doc_plots/rq1_plots/120825_c1_ttr.png differ diff --git a/doc_plots/rq1_plots/120825_tasks_created.png b/doc_plots/rq1_plots/120825_tasks_created.png index 539d6e1..4ec1865 100644 Binary files a/doc_plots/rq1_plots/120825_tasks_created.png and b/doc_plots/rq1_plots/120825_tasks_created.png differ diff --git a/doc_plots/rq1_plots/120825_tasks_status.png b/doc_plots/rq1_plots/120825_tasks_status.png index fb240bf..5cce9fe 100644 Binary files a/doc_plots/rq1_plots/120825_tasks_status.png and b/doc_plots/rq1_plots/120825_tasks_status.png differ diff --git a/120725_logit_dsl.RDS b/dsl/120725_logit_dsl.RDS similarity index 100% rename from 120725_logit_dsl.RDS rename to dsl/120725_logit_dsl.RDS diff --git a/dsl/dsl.R b/dsl/dsl.R index d9e43bb..4095ada 100644 --- a/dsl/dsl.R +++ b/dsl/dsl.R @@ -80,9 +80,9 @@ dev_model <- dsl( sample_split = 20, data=dsl_df ) -summary(dev_model) -saveRDS(dev_model, "120725_logit_dsl.RDS") - +#summary(dev_model) +#saveRDS(dev_model, "120725_logit_dsl.RDS") +dev_model <- readRDS("dsl/120725_logit_dsl.RDS") library(broom) library(dplyr) tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) { @@ -138,7 +138,7 @@ coef_df <- coef_df |> "WMF-affiliate Author:HTTP-deprecation" ))) ) -ggplot(coef_df, aes(x = estimate, y = term)) + +dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) + geom_point(size = 1) + geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + @@ -146,3 +146,10 @@ ggplot(coef_df, aes(x = estimate, y = term)) + x = "Coefficient Estimate", y = "Variable") + theme_minimal() +ggsave( + filename = "120825_dsl_coefs.png", + plot = dsl_coefs, + width = 8, # inches + height = 6, # inches + dpi = 600 # high resolution +) diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R index 383bf80..c4f0413 100644 --- a/dsl/final_bivariate.R +++ b/dsl/final_bivariate.R @@ -93,15 +93,14 @@ c1_ttr_plot <- dsl_df |> size = 4) + labs(x = "Weeks from Release", y = "Time to Resolution (weeks)", - fill = "Priority Tag", - title = "VisualEditor Time to Resolution by Triage Priority") + + fill = "Priority Tag") + theme(legend.position = "top") c1_ttr_plot ggsave( filename = "120825_c1_ttr.png", plot = c1_ttr_plot, width = 12, # inches - height = 4, # inches + height = 6, # inches dpi = 600 # high resolution ) @@ -204,7 +203,6 @@ tasks_created <- ggplot( labs( x = "Weeks from Feature Deployment", y = "Count of Tasks Created", - title = "Phabricator Tasks Created by Week and Author Affiliation", fill = "Task Author Affiliated with WMF?" ) + theme(legend.position = "top") @@ -213,7 +211,7 @@ ggsave( filename = "120825_tasks_created.png", plot = tasks_created, width = 12, # inches - height = 4, # inches + height = 6, # inches dpi = 600 # high resolution ) diff --git a/dsl/rq1_plots.R b/dsl/rq1_plots.R index 20aca6b..6902950 100644 --- a/dsl/rq1_plots.R +++ b/dsl/rq1_plots.R @@ -88,8 +88,7 @@ task_status_plot <- declined_summary|> "c3" = "HTTP-deprecation"))) + geom_col(position = position_dodge(width = 0.9), width = 0.8) + scale_fill_viridis_d(option='magma') + - labs(title = "Task Status (as of February 28, 2025) by Week", - x = "Weeks from feature deployment", + labs(x = "Weeks from feature deployment", y = "% of items in status", fill = "Task Status") + geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) + @@ -103,7 +102,7 @@ ggsave( filename = "120825_tasks_status.png", plot = task_status_plot, width = 12, # inches - height = 4, # inches + height = 6, # inches dpi = 600 # high resolution )