1
0
mw-lifecycle-analysis/main_plot_script.R
2026-01-10 17:32:30 -08:00

216 lines
7.3 KiB
R

library(tidyverse)
library(dplyr)
library(tidyr)
# Load the task-level data frame; every summary and figure below is derived
# from `dsl_df`.
dsl_csv <- "~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
# 4.1 Weekly summary ----
# One row per (week_index, source, isAuthorWMF) combination; this table feeds
# the "tasks created" figure.
# NOTE(review): sum() and median() below propagate NA. Add `na.rm = TRUE` if
# any of these columns can contain missing values -- confirm against the data.
weekly_summary <- dsl_df |>
  group_by(week_index, source, isAuthorWMF) |>
  summarise(
    # Number of rows whose resolution_outcome is not NA.
    tasks_made = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(dsl_score),
    author_closer_sum = sum(author_closer == TRUE),
    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
    median_olmo_RK_prop_adac = median(olmo_RK_prop_adac),
    median_comments_before_resolution = median(n_comments_before),
    # Return an ungrouped table: without this the result stays grouped by
    # (week_index, source) and summarise() prints a regrouping message.
    .groups = "drop"
  ) |>
  # Fix the level order so the fill colours are assigned consistently.
  mutate(
    isAuthorWMF = factor(isAuthorWMF, levels = c("FALSE", "BzImport", "TRUE"))
  )
# Figure: weekly count of tasks created, one facet row per `source`, with
# bars dodged by task-author affiliation.

# Per-facet vertical markers, one row per marker. Passing the full filtered
# `weekly_summary` as layer data would draw the same line once for every row
# of that facet; a dedicated frame draws each marker exactly once.
dotted_markers <- data.frame(
  source = c("c1", "c1", "c2", "c3"),
  week = c(-29, -9, -99, -97)
)
dash_dot_markers <- data.frame(
  source = c("c1", "c2", "c3"),
  week = c(-4, -4, -3)
)

# Text annotations, one row per label. The same reasoning applies: a label
# taken from `weekly_summary` rows is printed once per matching row (which
# renders as jagged, overplotted text) and silently vanishes when that exact
# week has no data.
marker_labels <- data.frame(
  source = c("c1", "c1", "c2"),
  week = c(10, -21, -18),
  height = c(120, 120, 20),
  text = c("Opt-out deployment", "Opt-in Testing", "Deployment Announcement")
)

tasks_created <- ggplot(
  weekly_summary,
  aes(
    x = week_index,
    y = tasks_made,
    fill = isAuthorWMF
  )
) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  geom_vline(
    data = dotted_markers,
    aes(xintercept = week),
    linetype = "dotted", color = "black", linewidth = 0.5
  ) +
  geom_vline(
    data = dash_dot_markers,
    aes(xintercept = week),
    linetype = "3313", color = "black", linewidth = 0.5
  ) +
  # Week 0 is marked in every facet.
  geom_vline(
    xintercept = 0,
    linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  geom_text(
    data = marker_labels,
    aes(x = week, y = height, label = text),
    size = 3,
    # The label frame has no tasks_made / isAuthorWMF columns, so the
    # plot-level mapping must not be inherited.
    inherit.aes = FALSE
  ) +
  theme_minimal() +
  scale_fill_viridis_d(
    breaks = c("FALSE", "TRUE", "BzImport"),
    labels = c("Nonaffiliate", "WMF-affiliate", "BzImport")
  ) +
  labs(
    x = "Weeks from Feature Deployment",
    y = "Count of Tasks Created",
    fill = "Task Author"
  ) +
  theme(legend.position = "top")
tasks_created
ggsave(
  filename = "011025_tasks_created.png",
  plot = tasks_created,
  width = 8, # inches
  height = 4, # inches
  dpi = 800 # high resolution
)
# 4.2 Time to resolution (TTR) ----
# Per (week_index, isTriaged, source): number of tasks plus mean and standard
# deviation of TTR expressed in weeks.
ttr_trajectory <- dsl_df |>
  mutate(
    ttr_weeks = TTR_hours / 168, # 168 hours = 1 week
    # Rows with priority "Needs Triage" count as not triaged; a missing
    # priority yields NA here.
    isTriaged = if_else(priority == "Needs Triage", "Not Triaged", "Triaged")
  ) |>
  group_by(week_index, isTriaged, source) |>
  summarise(
    count = n(),
    mean_ttr = mean(ttr_weeks, na.rm = TRUE),
    # sd() is NA for groups with fewer than two non-missing values.
    sd_ttr = sd(ttr_weeks, na.rm = TRUE),
    # Return an ungrouped table instead of one still grouped by
    # (week_index, isTriaged); this also silences the regrouping message.
    .groups = "drop"
  )
# Exploratory plot: mean TTR (line, with a +/- 1 SD ribbon) and the weekly
# task count (points) for not-yet-triaged tasks from week -13 onwards.
# NOTE(review): the task count is drawn against the same y axis that is
# labelled "Mean TTR (in weeks)" -- confirm this is intended.
ttr_trajectory_plot <- ttr_trajectory |>
  filter(week_index >= -13, isTriaged == "Not Triaged") |>
  ggplot(aes(x = week_index)) +
  # Line for mean TTR
  geom_line(aes(y = mean_ttr, color = "Mean TTR"), linewidth = 1) +
  # Ribbon for standard deviation
  geom_ribbon(
    aes(ymin = mean_ttr - sd_ttr, ymax = mean_ttr + sd_ttr),
    fill = "lightblue", alpha = 0.4
  ) +
  # Points for count of tasks. geom_point() has no linewidth/linetype
  # parameters; passing them only produced an "unknown parameters" warning,
  # so they are omitted (the rendered points are unchanged).
  geom_point(aes(y = count, color = "Count of New Tasks")) +
  # Facet the plot by source and triaged status
  facet_wrap(source ~ isTriaged, scales = "free_y") +
  labs(
    title = "TTR by Source and Triage Status (TODO)",
    x = "Week Index",
    y = "Mean TTR (in weeks)",
    color = "Metrics"
  ) +
  scale_color_manual(
    values = c("Mean TTR" = "blue", "Count of New Tasks" = "red")
  ) +
  theme_minimal()
ttr_trajectory_plot
# Box plot of TTR (in weeks) per week and priority tag, one facet row per
# `source`, restricted to three priority tags and to week -13 onwards.

# Priority tags shown in the box plot.
boxplot_priorities <- c("Needs Triage", "Unbreak Now!", "High")

# Single-row annotation frame. Taking the label from every matching row of
# `dsl_df` printed the same text once per task, which renders as jagged,
# overplotted lettering.
# NOTE(review): the x axis is discrete (as.factor(week_index)), so the
# numeric 12 here and the 14 in geom_vline() are positions along that axis,
# not week values. Presumably position 14 is week 0 when every week from -13
# onwards is present -- confirm.
boxplot_label <- data.frame(
  source = "c1",
  x_pos = 12,
  y_pos = 80,
  text = "Opt-in Testing"
)

ttr_boxplot <- dsl_df |>
  filter(priority %in% boxplot_priorities, week_index >= -13) |>
  ggplot(
    aes(
      x = as.factor(week_index),
      y = TTR_hours / 168, # 168 hours = 1 week
      color = priority
    )
  ) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  # Outlier points are hidden; the y range is then clipped for display only.
  geom_boxplot(outlier.shape = NA) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 112)) +
  geom_text(
    data = boxplot_label,
    aes(x = x_pos, y = y_pos, label = text),
    color = "black",
    size = 3,
    # The annotation frame has no TTR_hours / priority columns.
    inherit.aes = FALSE
  ) +
  geom_vline(
    xintercept = 14,
    linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  scale_color_viridis_d(option = "turbo") +
  labs(
    x = "Weeks from Release",
    y = "Time to Resolution (weeks)",
    color = "Priority Tag"
  ) +
  theme(legend.position = "top")
ttr_boxplot
ggsave(
  filename = "011025_ttr_boxplot.png",
  plot = ttr_boxplot,
  width = 8, # inches
  height = 4, # inches
  dpi = 800 # high resolution
)
# 4.3 Machine labels of information type ----
# From here on `dsl_df` excludes rows whose isAuthorWMF is "BzImport".
dsl_df <- filter(dsl_df, isAuthorWMF != "BzImport")

# Long format: one row per task and label category, with the category's
# proportion in `proportion` and a human-readable category name in `tag`.
dsl_df_long <- dsl_df |>
  pivot_longer(
    cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
    names_to = "tag",
    values_to = "proportion"
  ) |>
  mutate(
    # Strip the column-name prefix/suffix, leaving "EP", "RK" or "TSOL".
    tag = gsub("olmo_|_prop_adac", "", tag),
    # Translate the short code through a lookup table; any other code
    # becomes NA.
    tag = unname(
      c(
        EP = "Existent Problem",
        RK = "Record Keeping",
        TSOL = "Solutions"
      )[tag]
    )
  )
# Box plots of the machine-labelled proportion per information-type category,
# filled by author affiliation, one facet row per `source`.
olmo_comparison <- dsl_df_long |>
  ggplot(aes(x = tag, y = proportion, fill = isAuthorWMF)) +
  geom_boxplot() +
  facet_grid(
    rows = vars(source),
    scales = "free_y",
    labeller = labeller(
      source = c(
        c1 = "VisualEditor",
        c2 = "HTTPS-login",
        c3 = "HTTP-deprecation"
      )
    )
  ) +
  scale_fill_viridis_d() +
  labs(
    x = "Issue Information Type Category",
    y = "% of sentences machine-labeled",
    color = "Is Author WMF?",
    fill = "Is Author WMF?"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
olmo_comparison

# Write the figure to disk: 8 x 4 inches at 800 dpi.
ggsave(
  plot = olmo_comparison,
  filename = "011025_machine_label_comparison.png",
  width = 8,
  height = 4,
  dpi = 800
)