diff --git a/011025_dsl_coefs.png b/011025_dsl_coefs.png
new file mode 100644
index 0000000..7903d0a
Binary files /dev/null and b/011025_dsl_coefs.png differ
diff --git a/analysis_data/scratch.R b/analysis_data/scratch.R
index 27f3d2a..8a56d8a 100644
--- a/analysis_data/scratch.R
+++ b/analysis_data/scratch.R
@@ -1,8 +1,40 @@
 library(tidyverse)
 library(dplyr)
+library(stringr)
 main_csv <-"~/analysis_data/121625_unified.csv"
 main_df <- read.csv(main_csv, header = TRUE)
 
+#01-10-26 look for affil rosters
+affils_ <- main_df |>
+  group_by(isAuthorWMF) |>
+  summarise(
+    n_authors = n_distinct(AuthorPHID),
+    .groups = "drop"
+  )
+
+#01-09-26 looking for comments that say certain things:
+relelvant_messages <- main_df |>
+  mutate(
+    substring_count = str_count(comment_text, "meeting")
+  ) |>
+  filter(substring_count != 0)
+
+# 01-09-26
+split_of_comments <- main_df |>
+  group_by(comment_type, source) |>
+  summarize(
+    count = n()
+  )
+
+authors_count <- main_df |>
+  group_by(source, isAuthorWMF) |>
+  summarise(
+    n_authors = n_distinct(AuthorPHID),
+    .groups = "drop"
+  )
+
+
+#below 01-09-26
 bz_summary <- main_df |>
   mutate(isBz = if_else(
     AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb", TRUE, FALSE
@@ -64,7 +96,7 @@ summary_df <- tasks_flagged %>%
       TRUE ~ NA
     )
   ) |>
-  group_by(period, source, isAuthorWMF) %>%
+  group_by(period, source) %>%
   summarize(
     total_tasks = n(),
     first_time_tasks = sum(is_first_time_author),
diff --git a/doc_plots/011025_machine_label_comparison.png b/doc_plots/011025_machine_label_comparison.png
new file mode 100644
index 0000000..ebf0f44
Binary files /dev/null and b/doc_plots/011025_machine_label_comparison.png differ
diff --git a/doc_plots/011025_tasks_created.png b/doc_plots/011025_tasks_created.png
new file mode 100644
index 0000000..b09c878
Binary files /dev/null and b/doc_plots/011025_tasks_created.png differ
diff --git a/doc_plots/011025_ttr_boxplot.png b/doc_plots/011025_ttr_boxplot.png
new file mode 100644
index 0000000..d0d20bc
Binary files 
/dev/null and b/doc_plots/011025_ttr_boxplot.png differ
diff --git a/dsl/dsl.R b/dsl/dsl.R
index c4e7059..a02a76a 100644
--- a/dsl/dsl.R
+++ b/dsl/dsl.R
@@ -82,7 +82,7 @@ dev_model <- dsl(
 )
 summary(dev_model)
 #saveRDS(dev_model, "120725_logit_dsl.RDS")
-#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
+dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
 library(broom)
 library(dplyr)
 tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@@ -149,9 +149,11 @@ dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
        y = "Variable") +
   theme_minimal()
 ggsave(
-  filename = "120825_dsl_coefs.png",
+  filename = "011025_dsl_coefs.png",
   plot = dsl_coefs,
   width = 8, # inches
-  height = 6, # inches
-  dpi = 600 # high resolution
+  height = 4, # inches
+  dpi = 800 # high resolution
 )
+
+library(texreg)
diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R
index 90d495e..8511dcd 100644
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@@ -188,17 +188,17 @@ tasks_created <- ggplot(
              linetype = "3313", color = "black", linewidth = 0.5) +
   geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
   geom_text(
-    data = subset(weekly_summary, source == "c1" & week_index == 6),
+    data = subset(weekly_summary, source == "c1" & week_index == 10),
     aes(x=week_index, y=120, label='Opt-out deployment'),
-    size = 2.5) +
+    size = 3) +
   geom_text(
-    data = subset(weekly_summary, source == "c1" & week_index == -33),
+    data = subset(weekly_summary, source == "c1" & week_index == -21),
     aes(x=week_index, y=120, label='Opt-in Testing'),
-    size = 2.5) +
+    size = 3) +
   geom_text(
-    data = subset(weekly_summary, source == "c2" & week_index == -12),
+    data = subset(weekly_summary, source == "c2" & week_index == -18),
     aes(x=week_index, y=20, label='Deployment Announcement'),
-    size = 2.5) +
+    size = 3) +
   theme_minimal() +
   scale_fill_viridis_d(
     breaks = c("FALSE", "TRUE", "BzImport"),
@@ -212,10 +212,10 @@ tasks_created <- ggplot(
   theme(legend.position = "top")
 tasks_created
 ggsave(
-  filename = "121625_tasks_created.png",
+  filename = "011025_tasks_created.png",
   plot = tasks_created,
-  width = 12, # inches
-  height = 6, # inches
+  width = 8, # inches
+  height = 4, # inches
   dpi = 800 # high resolution
 )
 
diff --git a/main_plot_script.R b/main_plot_script.R
new file mode 100644
index 0000000..8b54f86
--- /dev/null
+++ b/main_plot_script.R
@@ -0,0 +1,225 @@
+library(tidyverse)
+library(dplyr)
+library(tidyr)
+dsl_csv <-"~/dsl/121625_DSL_frame.csv"
+dsl_df <- read.csv(dsl_csv, header = TRUE)
+
+# Facet strip labels for the case codes stored in `source`; shared by the
+# three faceted figures below so their labels cannot drift apart.
+source_labels <- c("c1" = "VisualEditor",
+                   "c2" = "HTTPS-login",
+                   "c3" = "HTTP-deprecation")
+# Priority tags kept in the 4.2 boxplot and in its annotation layer.
+boxplot_priorities <- c("Needs Triage", "Unbreak Now!", "High")
+
+#4.1 weekly count of tasks created per case (`source`) and author group
+weekly_summary <- dsl_df |>
+  group_by(week_index, source, isAuthorWMF) |>
+  summarise(
+    tasks_made = sum(!is.na(resolution_outcome)),
+    count_resolution_outcome = sum(dsl_score),
+    author_closer_sum = sum(author_closer == TRUE),
+    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
+    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
+    median_olmo_RK_prop_adac = median(olmo_RK_prop_adac),
+    median_comments_before_resolution = median(n_comments_before),
+    # Return an ungrouped table; by default summarise() would leave the
+    # result grouped by week_index and source and print a regrouping message.
+    .groups = "drop"
+  ) |>
+  mutate(isAuthorWMF = factor(isAuthorWMF, levels = c("FALSE", "BzImport", "TRUE")))
+
+tasks_created <- ggplot(
+  weekly_summary,
+  aes(
+    x=week_index,
+    y=tasks_made,
+    fill=isAuthorWMF
+  )
+) +
+  facet_grid(source ~ .,
+             scales = "free_y",
+             labeller = labeller(source = source_labels)) +
+  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
+  # Reference lines: each layer's `data` is filtered to one `source`, so the
+  # line is drawn only in that case's facet.
+  geom_vline(data = weekly_summary |> filter(source == "c1"),
+             aes(xintercept = -29),
+             linetype = "dotted", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c1"),
+             aes(xintercept = -9),
+             linetype = "dotted", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c1"),
+             aes(xintercept = -4),
+             linetype = "3313", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c2"),
+             aes(xintercept = -99),
+             linetype = "dotted", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c2"),
+             aes(xintercept = -4),
+             linetype = "3313", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c3"),
+             aes(xintercept = -97),
+             linetype = "dotted", color = "black", linewidth = 0.5) +
+  geom_vline(data = weekly_summary |> filter(source == "c3"),
+             aes(xintercept = -3),
+             linetype = "3313", color = "black", linewidth = 0.5) +
+  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
+  geom_text(
+    data = subset(weekly_summary, source == "c1" & week_index == 10),
+    aes(x=week_index, y=120, label='Opt-out deployment'),
+    size = 3) +
+  geom_text(
+    data = subset(weekly_summary, source == "c1" & week_index == -21),
+    aes(x=week_index, y=120, label='Opt-in Testing'),
+    size = 3) +
+  geom_text(
+    data = subset(weekly_summary, source == "c2" & week_index == -18),
+    aes(x=week_index, y=20, label='Deployment Announcement'),
+    size = 3) +
+  theme_minimal() +
+  scale_fill_viridis_d(
+    breaks = c("FALSE", "TRUE", "BzImport"),
+    labels = c("Nonaffiliate", "WMF-affiliate", "BzImport")
+  ) +
+  labs(
+    x = "Weeks from Feature Deployment",
+    y = "Count of Tasks Created",
+    fill = "Task Author"
+  ) +
+  theme(legend.position = "top")
+tasks_created
+ggsave(
+  filename = "011025_tasks_created.png",
+  plot = tasks_created,
+  width = 8, # inches
+  height = 4, # inches
+  dpi = 800 # high resolution
+)
+
+
+#4.2 plot comparing the TTR for different things
+ttr_trajectory <- dsl_df |>
+  mutate(ttr_weeks = TTR_hours / 168) |>
+  mutate(isTriaged = if_else(priority == 'Needs Triage',
+                             "Not Triaged",
+                             "Triaged")) |>
+  group_by(week_index, isTriaged, source) |>
+  summarise(
+    count = n(),
+    mean_ttr = mean(ttr_weeks, na.rm = TRUE),
+    sd_ttr = sd(ttr_weeks, na.rm = TRUE),
+    .groups = "drop"
+  )
+ttr_trajectory_plot <- ttr_trajectory |>
+  filter(week_index >= -13) |>
+  filter(isTriaged == "Not Triaged") |>
+  ggplot(aes(x = week_index)) +
+  # Line for mean TTR
+  geom_line(aes(y = mean_ttr, color = "Mean TTR"), linewidth = 1) +
+  # Ribbon for standard deviation
+  geom_ribbon(aes(ymin = mean_ttr - sd_ttr, ymax = mean_ttr + sd_ttr),
+              fill = "lightblue", alpha = 0.4) +
+  # Points for the weekly count of tasks. The former `linewidth` and
+  # `linetype` arguments were dropped: geom_point() does not use them.
+  geom_point(aes(y = count,
+                 color = "Count of New Tasks")) +
+  # Facet the plot by source and triaged status
+  facet_wrap(source ~ isTriaged, scales = "free_y") +
+  labs(
+    title = "TTR by Source and Triage Status (TODO)",
+    x = "Week Index",
+    y = "Mean TTR (in weeks)",
+    color = "Metrics"
+  ) +
+  scale_color_manual(values = c("Mean TTR" = "blue", "Count of New Tasks" = "red")) +
+  theme_minimal()
+ttr_trajectory_plot
+
+
+ttr_boxplot <- dsl_df |>
+  filter(priority %in% boxplot_priorities) |>
+  filter(week_index >= -13) |>
+  ggplot(
+    aes(
+      x=as.factor(week_index),
+      y= TTR_hours/168,
+      color=priority
+    )
+  ) +
+  facet_grid(source ~ .,
+             scales = "free_y",
+             labeller = labeller(source = source_labels)) +
+  geom_boxplot(outlier.shape = NA) +
+  theme_minimal() +
+  coord_cartesian(ylim = c(0, 112)) +
+  # NOTE(review): x is a factor in this plot, so the numeric x of this label
+  # and the xintercept of the vline below are presumably positions along the
+  # discrete axis, not week_index values -- confirm the intended placement.
+  geom_text(
+    data = subset(dsl_df,
+                  priority %in% boxplot_priorities &
+                    source == "c1" & week_index == 12),
+    aes(x=week_index, y=80, label='Opt-in Testing'),
+    color = "black",
+    size = 3) +
+  geom_vline(xintercept =14, linetype = "dashed", color = "black", linewidth = 0.5) +
+  scale_color_viridis_d(option='turbo') +
+  labs(x = "Weeks from Release",
+       y = "Time to Resolution (weeks)",
+       color = "Priority Tag") +
+  theme(legend.position = "top")
+ttr_boxplot
+ggsave(
+  filename = "011025_ttr_boxplot.png",
+  plot = ttr_boxplot,
+  width = 8, # inches
+  height = 4, # inches
+  dpi = 800 # high resolution
+)
+#4.3 plot comparing machine labels of information type
+dsl_df <- dsl_df |>
+  filter(isAuthorWMF != "BzImport")
+
+dsl_df_long <- dsl_df %>%
+  pivot_longer(
+    cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
+    names_to = "tag",
+    values_to = "proportion"
+  ) %>%
+  mutate(tag = gsub("olmo_|_prop_adac", "", tag),
+         tag = case_when(
+           tag == "EP" ~ "Existent Problem",
+           tag == "RK" ~ "Record Keeping",
+           tag =="TSOL" ~ "Solutions"
+         ))
+
+olmo_comparison <- ggplot(
+  dsl_df_long,
+  aes(
+    x = tag,
+    y = proportion,
+    fill = isAuthorWMF
+  )
+) +
+  facet_grid(source ~ .,
+             scales = "free_y",
+             labeller = labeller(source = source_labels)) +
+  geom_boxplot() +
+  theme_minimal() +
+  scale_fill_viridis_d() +
+  labs(
+    x = "Issue Information Type Category",
+    y = "% of sentences machine-labeled",
+    color = "Is Author WMF?",
+    fill = "Is Author WMF?"
+  ) +
+  theme(legend.position = "top")
+olmo_comparison
+ggsave(
+  filename = "011025_machine_label_comparison.png",
+  plot = olmo_comparison,
+  width = 8, # inches
+  height = 4, # inches
+  dpi = 800 # high resolution
+)
diff --git a/mgaughan-rstudio-server_32251441.out b/mgaughan-rstudio-server_32251441.out
new file mode 100644
index 0000000..5a79413
--- /dev/null
+++ b/mgaughan-rstudio-server_32251441.out
@@ -0,0 +1,17 @@
+1. 
SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3443:42777 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: [REDACTED] + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 32251441