1
0

updating with revised plots and data

This commit is contained in:
Matthew Gaughan 2025-12-08 20:21:58 -08:00
parent e7e1bb3458
commit b0d4950bee
9 changed files with 80 additions and 24 deletions

View File

@ -2,24 +2,76 @@ library(tidyverse)
# Read the unified CSV into a data frame.
main_csv <- "~/analysis_data/120725_unified.csv"
main_df <- read.csv(file = main_csv, header = TRUE)

# Keep task-description rows whose author_closer column is the string "True",
# then tabulate those rows by isAuthorWMF.
author_closer <- main_df |>
  filter(
    comment_type == "task_description",
    author_closer == "True"
  )
table(author_closer$isAuthorWMF)
# Read the DSL frame CSV; the summaries that follow are computed from dsl_df.
dsl_csv <- "~/dsl/120725_DSL_frame.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
# Mean and SD of time to resolution (in weeks) for "Needs Triage" rows with
# week_index >= -4, broken out by source and isAuthorWMF.
# NOTE(review): unlike `changes`, this window has no upper bound on
# week_index -- confirm that is intended.
needs_triage <- dsl_df |>
  filter(
    week_index >= -4,
    priority == "Needs Triage"
  ) |>
  # 168 = hours per week; assumes TTR is recorded in hours -- TODO confirm.
  mutate(ttr_weeks = TTR / 168) |>
  group_by(source, isAuthorWMF) |>
  summarize(
    # na.rm = TRUE so one missing TTR does not turn a whole group's summary
    # into NA; this matches how `changes` computes the same statistics.
    mean_ttr_weeks = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr_weeks = sd(ttr_weeks, na.rm = TRUE),
    # Same grouping the default produced (still grouped by source), stated
    # explicitly so dplyr does not emit its regrouping message.
    .groups = "drop_last"
  )
# Per source / period / priority: row count plus mean and SD of time to
# resolution (in weeks) for the three priority tags of interest, comparing the
# window around week 0 with the window before it.
# NOTE(review): each window below covers nine week_index values (-4..4 and
# -13..-5) although the labels say "8 weeks" -- confirm the intended bounds.
changes <- dsl_df |>
  filter(priority %in% c("Needs Triage", "Unbreak Now!", "High")) |>
  mutate(
    period = case_when(
      week_index >= -4 & week_index <= 4 ~ "8 weeks after announcement",
      week_index >= -13 & week_index <= -5 ~ "8 weeks before deployment announcement",
      # Typed NA keeps the column character on every dplyr version and matches
      # the other case_when() in this script.
      TRUE ~ NA_character_
    ),
    # 168 = hours per week; assumes TTR is recorded in hours -- TODO confirm.
    ttr_weeks = TTR / 168
  ) |>
  # Rows outside both windows have no period label and are dropped.
  filter(!is.na(period)) |>
  group_by(source, period, priority) |>
  summarise(
    count = n(),
    mean_ttr_weeks = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr_weeks = sd(ttr_weeks, na.rm = TRUE),
    # Same grouping the default produced (source, period), stated explicitly
    # so dplyr does not emit its regrouping message.
    .groups = "drop_last"
  )
new_authors_summary <- main_df |>
# new contributors
first_task <- main_df |>
filter(comment_type == "task_description") |>
group_by(source, AuthorPHID) |>
summarise(
task_count = n(),
first_task = min(week_index)
) |>
group_by(first_task, source) |>
summarise(
new_authors_count = n()
first_task_week = min(week_index)
)
ggplot(new_authors_summary, aes(x = first_task, y = new_authors_count)) +
# Join the per-author first_task table onto every task-description row and
# flag rows whose week_index equals that author's first_task_week.
tasks_flagged <- main_df |>
  filter(comment_type == "task_description") |>
  left_join(
    first_task,
    by = c("source", "AuthorPHID")
  ) |>
  mutate(
    is_first_time_author = week_index == first_task_week
  )
# Weekly totals per source: number of tasks, number flagged as first-time
# author, and the share of first-time tasks among all tasks that week.
summary_df <- tasks_flagged |>
  group_by(week_index, source) |>
  summarize(
    total_tasks = n(),
    first_time_tasks = sum(is_first_time_author),
    proportion_first_time = first_time_tasks / total_tasks
  ) |>
  # summarize() leaves the result grouped by week_index; remove that grouping.
  ungroup()
# Total first-time-author tasks per source in two windows: "prior"
# (week_index -13..-5) and "recent" (week_index -4..4). Weeks outside both
# windows are discarded.
period_counts <- summary_df |>
  mutate(
    period = case_when(
      week_index >= -13 & week_index <= -5 ~ "prior",
      week_index >= -4 & week_index <= 4 ~ "recent",
      TRUE ~ NA_character_
    )
  ) |>
  filter(!is.na(period)) |>
  group_by(source, period) |>
  summarize(
    period_first_time_tasks = sum(first_time_tasks),
    .groups = "drop"
  )
ggplot(summary_df, aes(x = week_index, y = first_time_tasks)) +
facet_grid(source ~ .,
scales = "free_y",
labeller = labeller(source = c("c1" = "VisualEditor",

Binary file not shown.

After

Width:  |  Height:  |  Size: 427 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 298 KiB

After

Width:  |  Height:  |  Size: 298 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 331 KiB

After

Width:  |  Height:  |  Size: 328 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 352 KiB

After

Width:  |  Height:  |  Size: 338 KiB

View File

@ -80,9 +80,9 @@ dev_model <- dsl(
sample_split = 20,
data=dsl_df
)
summary(dev_model)
saveRDS(dev_model, "120725_logit_dsl.RDS")
#summary(dev_model)
#saveRDS(dev_model, "120725_logit_dsl.RDS")
dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
library(broom)
library(dplyr)
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@ -138,7 +138,7 @@ coef_df <- coef_df |>
"WMF-affiliate Author:HTTP-deprecation"
)))
)
ggplot(coef_df, aes(x = estimate, y = term)) +
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point(size = 1) +
geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
@ -146,3 +146,10 @@ ggplot(coef_df, aes(x = estimate, y = term)) +
x = "Coefficient Estimate",
y = "Variable") +
theme_minimal()
# Write the coefficient plot to disk as an 8 x 6 inch PNG at 600 dpi.
ggsave(
  filename = "120825_dsl_coefs.png",
  plot = dsl_coefs,
  width = 8,
  height = 6,
  units = "in",
  dpi = 600
)

View File

@ -93,15 +93,14 @@ c1_ttr_plot <- dsl_df |>
size = 4) +
labs(x = "Weeks from Release",
y = "Time to Resolution (weeks)",
fill = "Priority Tag",
title = "VisualEditor Time to Resolution by Triage Priority") +
fill = "Priority Tag") +
theme(legend.position = "top")
c1_ttr_plot
ggsave(
filename = "120825_c1_ttr.png",
plot = c1_ttr_plot,
width = 12, # inches
height = 4, # inches
height = 6, # inches
dpi = 600 # high resolution
)
@ -204,7 +203,6 @@ tasks_created <- ggplot(
labs(
x = "Weeks from Feature Deployment",
y = "Count of Tasks Created",
title = "Phabricator Tasks Created by Week and Author Affiliation",
fill = "Task Author Affiliated with WMF?"
) +
theme(legend.position = "top")
@ -213,7 +211,7 @@ ggsave(
filename = "120825_tasks_created.png",
plot = tasks_created,
width = 12, # inches
height = 4, # inches
height = 6, # inches
dpi = 600 # high resolution
)

View File

@ -88,8 +88,7 @@ task_status_plot <- declined_summary|>
"c3" = "HTTP-deprecation"))) +
geom_col(position = position_dodge(width = 0.9), width = 0.8) +
scale_fill_viridis_d(option='magma') +
labs(title = "Task Status (as of February 28, 2025) by Week",
x = "Weeks from feature deployment",
labs(x = "Weeks from feature deployment",
y = "% of items in status",
fill = "Task Status") +
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
@ -103,7 +102,7 @@ ggsave(
filename = "120825_tasks_status.png",
plot = task_status_plot,
width = 12, # inches
height = 4, # inches
height = 6, # inches
dpi = 600 # high resolution
)