# Exploratory summaries and plots for the DSL task data frame.
#
# Reads the task-level CSV, derives a few summary tables
# (`data_summary`, `weekly_summary`, `outcome_summary`) and builds a series
# of ggplot2 figures. Plot objects are left as top-level expressions, so
# they render when the script is run interactively.

# Dependencies ----
library(tidyverse)
# library(dsl)
library(dplyr)
library(ggplot2)
library(ggdist)

# Load data ----
dsl_csv <- "~/dsl/120725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

# Priority tags in the order used for the factor levels below.
priority_levels <- c(
  "Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage"
)

# Subset of priorities shown in the priority-focused plots.
focus_priorities <- c("Needs Triage", "Unbreak Now!", "High")

# Reversed level order for the first two plots; the factor is re-levelled
# in the forward order further down, before the resolution-outcome bar chart.
dsl_df <- dsl_df |>
  mutate(priority = factor(priority, levels = rev(priority_levels)))

# Priority proportions per week ----
# Count tasks per (week, priority, source), then express each count as a
# share of all tasks in the same (week, source).
data_summary <- dsl_df |>
  group_by(week_index, priority, source) |>
  summarise(count = n(), .groups = "drop") |>
  group_by(week_index, source) |>
  mutate(proportion = count / sum(count)) |>
  ungroup()

data_summary |>
  filter(priority %in% focus_priorities) |>
  ggplot(
    aes(
      x = week_index,
      y = proportion,
      color = priority,
      fill = priority,
      group = priority
    )
  ) +
  geom_smooth() +
  scale_color_viridis_d(option = "turbo") +
  scale_fill_viridis_d(option = "turbo") +
  # `proportion` is a 0-1 share; format the axis so it matches the "%" label.
  scale_y_continuous(labels = scales::label_percent()) +
  facet_grid(source ~ ., scales = "free_y") +
  geom_point() +
  labs(
    title = "Triage priority proportions for new tasks by week created",
    x = "Weeks from feature deployment",
    y = "% of items tagged",
    # Same title on both aesthetics so ggplot2 merges them into one legend.
    color = "Priority Tag",
    fill = "Priority Tag"
  ) +
  theme_minimal()

# Time to resolution by creation week and priority ----
ttr_df <- dsl_df |>
  filter(priority %in% focus_priorities, week_index >= -26) |>
  mutate(week_factor = as.factor(week_index))

# On a discrete axis `xintercept` is a level position, not a week value.
# Look up where week 0 actually sits instead of hard-coding 27, which is
# only correct when every week from -26 to 0 is present after filtering.
# If week 0 is absent this is NA and no line is drawn (ggplot2 warns).
release_position <- match("0", levels(ttr_df$week_factor))

ggplot(
  ttr_df,
  aes(
    x = week_factor,
    # presumably TTR is in hours (168 h per week; /24 is used for days
    # further down) -- TODO confirm against the data dictionary
    y = TTR / 168,
    color = priority,
    fill = priority
  )
) +
  facet_grid(source ~ .) +
  geom_boxplot(outlier.shape = NA) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 112)) +
  geom_vline(
    xintercept = release_position,
    linetype = "dashed",
    color = "black",
    linewidth = 0.5
  ) +
  scale_color_viridis_d() +
  labs(
    x = "Weeks from Release",
    y = "Time to Resolution (weeks)",
    title = "TTR by Task Creation Date and Triage Priority"
  )

# Sentence-focus tags by author affiliation ----
# One row per (task, tag); the column-name prefix/suffix are stripped and
# the short codes are mapped to display labels.
dsl_df_long <- dsl_df |>
  pivot_longer(
    cols = c(olmo_EP_prop, olmo_RK_prop, olmo_TSOL_prop),
    names_to = "tag",
    values_to = "proportion"
  ) |>
  mutate(
    tag = gsub("olmo_|_prop", "", tag),
    tag = case_when(
      tag == "EP" ~ "Existent Problem",
      tag == "RK" ~ "Record Keeping",
      tag == "TSOL" ~ "Solutions"
    )
  )

ggplot(
  dsl_df_long,
  aes(x = tag, y = proportion, fill = isAuthorWMF)
) +
  facet_grid(source ~ .) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(
    x = "Tag",
    y = "% of sentences tagged",
    title = paste0(
      "Proportion of machine tags of sentence focus, ",
      "by comment author affiliation"
    ),
    color = "Is Author WMF",
    fill = "Is Author WMF"
  )

# Weekly task volume ----
# NOTE(review): the sums/medians below have no `na.rm`, so any NA in a group
# yields NA for that group; `count_resolution_outcome` sums `dsl_score`
# rather than `resolution_outcome` -- confirm both are intended.
weekly_summary <- dsl_df |>
  group_by(week_index, source, isAuthorWMF) |>
  summarise(
    tasks_made = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(dsl_score),
    author_closer_sum = sum(author_closer == TRUE),
    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
    median_olmo_RK_prop_adac = median(olmo_RK_prop_adac),
    median_comments_before_resolution = median(n_comments_before),
    .groups = "drop"
  )

# Per-source vertical reference lines (week position and line type).
# Restricted to sources present in the data so no empty facet is created.
source_markers <- tribble(
  ~source, ~week_index, ~linetype,
  "c1",    -29,         "dotted",
  "c1",    -9,          "dotted",
  "c1",    -4,          "3313",
  "c2",    -99,         "dotted",
  "c2",    -4,          "3313",
  "c3",    -97,         "dotted",
  "c3",    -3,          "3313"
) |>
  filter(source %in% weekly_summary$source)

ggplot(
  weekly_summary,
  aes(x = week_index, y = tasks_made, fill = isAuthorWMF)
) +
  facet_grid(source ~ ., scales = "free_y") +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  geom_vline(
    data = source_markers,
    aes(xintercept = week_index, linetype = linetype),
    color = "black",
    linewidth = 0.5
  ) +
  # Use the literal line-type strings stored in `source_markers`.
  scale_linetype_identity() +
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    color = "black",
    linewidth = 0.5
  ) +
  theme_minimal() +
  scale_fill_viridis_d()

# Outcome summary by source and author affiliation ----
outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF) |>
  summarise(
    total_sum = sum(!is.na(resolution_outcome)),
    # The denominator counts non-missing outcomes only, so the numerator
    # must skip NAs as well or a single NA turns the proportion into NA.
    count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE),
    success_prop = count_resolution_outcome / total_sum,
    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
    median_comments_before_resolution = median(n_comments_before),
    .groups = "drop"
  )

# 'Existent Problems' tag share over time ----
ggplot(
  dsl_df,
  aes(x = week_index, y = olmo_EP_prop_adac, color = isAuthorWMF)
) +
  facet_grid(source ~ .) +
  geom_point() +
  geom_smooth() +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(
    x = "Weeks from Release",
    y = "% of sentences machine-tagged as 'Existent Problems'",
    title = "Proportion of 'Existent Problems' tags over time"
  )

# Resolution outcome by priority ----
# Re-level priority in the forward order for the bar chart's x axis.
dsl_df <- dsl_df |>
  mutate(priority = factor(priority, levels = priority_levels))

ggplot(dsl_df, aes(fill = resolution_outcome, x = priority)) +
  facet_grid(~source) +
  geom_bar() +
  theme_minimal()

# Sign-preserving transforms ----

#' Sign-preserving power transform.
#'
#' @param x A numeric vector.
#' @param p Exponent applied to the absolute value of `x`.
#' @return `sign(x) * abs(x)^p`, the same length as `x`.
signed_power <- function(x, p) {
  sign(x) * abs(x)^p
}

#' Sign-preserving log transform.
#'
#' @param x A numeric vector.
#' @return `sign(x) * log1p(abs(x))`, the same length as `x`.
signed_log <- function(x) {
  sign(x) * log1p(abs(x))
}

dsl_df <- dsl_df |>
  mutate(
    sp_med_pc3_adac = signed_power(median_PC3_adac, 0.2),
    sp_med_pc4_adac = signed_power(median_PC4_adac, 0.2),
    sl_med_pc4_adac = signed_log(median_PC4_adac),
    sl_med_pc3_adac = signed_log(median_PC3_adac)
  )

# TTR against transformed PC4 ----
ggplot(
  dsl_df,
  aes(
    y = log1p(TTR / 24),
    x = sl_med_pc4_adac,
    shape = isAuthorWMF,
    color = isAuthorWMF
  )
) +
  facet_grid(~source) +
  theme_minimal() +
  geom_smooth(method = "loess", span = 0.5) +
  geom_point() +
  scale_color_viridis_d()