library(tidyverse)
library(dplyr)
library(tidyr)

# Builds three figures from the DSL task data frame and saves each as a PNG:
#   4.1  weekly count of tasks created, split by task author type
#   4.2  time-to-resolution (TTR) trajectory and boxplots by priority tag
#   4.3  proportion of machine-labelled sentences per information type
# Input : the CSV at `dsl_csv`.
# Output: 011925_tasks_created.png, 011925_ttr_boxplot.png and
#         011925_machine_label_comparison.png in the working directory.

dsl_csv <- "~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

# Facet strip labels for the three `source` codes; shared by every faceted
# figure so the panels are titled consistently.
source_labels <- labeller(
  source = c(
    "c1" = "VisualEditor",
    "c2" = "HTTPS-login",
    "c3" = "HTTP-deprecation"
  )
)

# Priority tags shown in the TTR boxplot.
boxplot_priorities <- c("Needs Triage", "Unbreak Now!", "High")

# 4.1 Tasks created per week ----

# One row per (week_index, source, isAuthorWMF).
# NOTE(review): apart from tasks_made, the aggregates below have no na.rm, so
# a single NA in a group makes that group's value NA. Only tasks_made is used
# by the figures in this script; add na.rm = TRUE if NAs should be skipped.
weekly_summary <- dsl_df |>
  group_by(week_index, source, isAuthorWMF) |>
  summarise(
    tasks_made = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(dsl_score),
    author_closer_sum = sum(author_closer == TRUE),
    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
    median_olmo_RK_prop_adac = median(olmo_RK_prop_adac),
    median_comments_before_resolution = median(n_comments_before),
    # Return an ungrouped tibble (and silence the regrouping message).
    .groups = "drop"
  ) |>
  mutate(
    isAuthorWMF = factor(isAuthorWMF, levels = c("FALSE", "BzImport", "TRUE"))
  )

# Per-source vertical reference lines (week offsets on the x axis). Keeping
# them in one table draws each line exactly once, in its own facet only.
# The linetype column is passed through verbatim by scale_linetype_identity().
reference_lines <- tribble(
  ~source, ~week, ~linetype,
  "c1",    -29,   "dotted",
  "c1",    -9,    "dotted",
  "c1",    -4,    "3313",
  "c2",    -99,   "dotted",
  "c2",    -4,    "3313",
  "c3",    -97,   "dotted",
  "c3",    -3,    "3313"
)

# Text annotations, one row per label so each is drawn once (a data subset
# with several matching rows would overplot the same label repeatedly).
task_labels <- tribble(
  ~source, ~week_index, ~y,  ~label,
  "c1",    10,          120, "Opt-out deployment",
  "c1",    -21,         120, "Opt-in Testing",
  "c2",    -18,         20,  "Deployment Announcement"
)

tasks_created <- ggplot(
  weekly_summary,
  aes(x = week_index, y = tasks_made, fill = isAuthorWMF)
) +
  facet_grid(source ~ ., scales = "free_y", labeller = source_labels) +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  geom_vline(
    data = reference_lines,
    aes(xintercept = week, linetype = linetype),
    color = "black",
    linewidth = 0.5
  ) +
  scale_linetype_identity() +
  # Week 0 line, drawn in every facet.
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    color = "black",
    linewidth = 0.5
  ) +
  geom_text(
    data = task_labels,
    aes(x = week_index, y = y, label = label),
    size = 3,
    inherit.aes = FALSE
  ) +
  theme_minimal() +
  scale_fill_viridis_d(
    breaks = c("FALSE", "TRUE", "BzImport"),
    labels = c("External Contributor", "WMF-affiliate", "BzImport")
  ) +
  labs(
    x = "Weeks from Feature Deployment",
    y = "Count of Tasks Created",
    fill = "Task Author"
  ) +
  theme(legend.position = "top")

tasks_created

ggsave(
  filename = "011925_tasks_created.png",
  plot = tasks_created,
  width = 8,   # inches
  height = 4,  # inches
  dpi = 800    # high resolution
)

# 4.2 Time to resolution (TTR) ----

# Mean and SD of TTR in weeks (168 hours per week) and task count, per
# (week_index, isTriaged, source). Tasks still tagged "Needs Triage" count
# as "Not Triaged"; every other priority value counts as "Triaged".
ttr_trajectory <- dsl_df |>
  mutate(
    ttr_weeks = TTR_hours / 168,
    isTriaged = if_else(priority == "Needs Triage", "Not Triaged", "Triaged")
  ) |>
  group_by(week_index, isTriaged, source) |>
  summarise(
    count = n(),
    mean_ttr = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr = sd(ttr_weeks, na.rm = TRUE),
    .groups = "drop"
  )

# Exploratory figure (title still marked TODO); displayed but not saved.
ttr_trajectory_plot <- ttr_trajectory |>
  filter(week_index >= -13, isTriaged == "Not Triaged") |>
  ggplot(aes(x = week_index)) +
  # Line for mean TTR
  geom_line(aes(y = mean_ttr, color = "Mean TTR"), linewidth = 1) +
  # Ribbon for one standard deviation either side of the mean
  geom_ribbon(
    aes(ymin = mean_ttr - sd_ttr, ymax = mean_ttr + sd_ttr),
    fill = "lightblue",
    alpha = 0.4
  ) +
  # Points for count of tasks. geom_point() has no linewidth/linetype
  # parameters, so none are passed (they were ignored with a warning).
  geom_point(aes(y = count, color = "Count of New Tasks")) +
  # Facet the plot by source and triaged status
  facet_wrap(source ~ isTriaged, scales = "free_y") +
  labs(
    title = "TTR by Source and Triage Status (TODO)",
    x = "Week Index",
    y = "Mean TTR (in weeks)",
    color = "Metrics"
  ) +
  scale_color_manual(
    values = c("Mean TTR" = "blue", "Count of New Tasks" = "red")
  ) +
  theme_minimal()

ttr_trajectory_plot

ttr_boxplot_data <- dsl_df |>
  filter(priority %in% boxplot_priorities, week_index >= -13)

# The x axis below is a factor, so numeric x values supplied by the label and
# the vertical line are positions along the discrete axis (1 = first level).
# NOTE(review): with the week_index >= -13 filter, position 14 is week 0 and
# position 12 is week -2 only if every week from -13 onward is present in the
# data -- confirm before changing the filter.
ttr_boxplot_label <- tribble(
  ~source, ~x, ~y, ~label,
  "c1",    12, 80, "Opt-out Deployment"
)

ttr_boxplot <- ggplot(
  ttr_boxplot_data,
  aes(x = as.factor(week_index), y = TTR_hours / 168, color = priority)
) +
  facet_grid(source ~ ., scales = "free_y", labeller = source_labels) +
  # Outliers are hidden and the y axis is zoomed (not clipped) to 0-112 weeks.
  geom_boxplot(outlier.shape = NA) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 112)) +
  geom_text(
    data = ttr_boxplot_label,
    aes(x = x, y = y, label = label),
    color = "black",
    size = 3,
    inherit.aes = FALSE
  ) +
  geom_vline(
    xintercept = 14,
    linetype = "dashed",
    color = "black",
    linewidth = 0.5
  ) +
  scale_color_viridis_d(option = "turbo") +
  labs(
    x = "Weeks from Release",
    y = "Time to Resolution (weeks)",
    color = "Priority Tag"
  ) +
  theme(legend.position = "top")

ttr_boxplot

ggsave(
  filename = "011925_ttr_boxplot.png",
  plot = ttr_boxplot,
  width = 8,   # inches
  height = 4,  # inches
  dpi = 800    # high resolution
)

# 4.3 Machine labels of information type ----

# Long format: one row per (task, tag) holding that tag's proportion.
# BzImport-authored tasks are excluded. Tags other than EP / RK / TSOL would
# become NA in case_when(); only those three columns are pivoted.
dsl_df_long <- dsl_df |>
  filter(isAuthorWMF != "BzImport") |>
  pivot_longer(
    cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
    names_to = "tag",
    values_to = "proportion"
  ) |>
  mutate(
    # Strip the "olmo_" prefix and "_prop_adac" suffix, leaving EP/RK/TSOL.
    tag = gsub("olmo_|_prop_adac", "", tag),
    tag = case_when(
      tag == "EP" ~ "Existent Problem",
      tag == "RK" ~ "Record Keeping",
      tag == "TSOL" ~ "Solutions"
    )
  )

olmo_comparison <- ggplot(
  dsl_df_long,
  aes(x = tag, y = proportion, fill = isAuthorWMF)
) +
  facet_grid(source ~ ., scales = "free_y", labeller = source_labels) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_viridis_d(
    breaks = c("FALSE", "TRUE", "BzImport"),
    labels = c("External Contributor", "WMF-affiliate", "BzImport")
  ) +
  labs(
    x = "Issue Information Type Category",
    y = "% of sentences machine-labeled",
    color = "Is Author WMF?",
    fill = "Is Author WMF?"
  ) +
  theme(legend.position = "top")

olmo_comparison

ggsave(
  filename = "011925_machine_label_comparison.png",
  plot = olmo_comparison,
  width = 8,   # inches
  height = 4,  # inches
  dpi = 800    # high resolution
)