updated figures, resolved some DSL issues
This commit is contained in:
parent
3cfe103730
commit
07b6fa12b3
BIN
011025_dsl_coefs.png
Normal file
BIN
011025_dsl_coefs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 516 KiB |
@ -1,8 +1,40 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
|
library(stringr)
|
||||||
main_csv <-"~/analysis_data/121625_unified.csv"
|
main_csv <-"~/analysis_data/121625_unified.csv"
|
||||||
main_df <- read.csv(main_csv, header = TRUE)
|
main_df <- read.csv(main_csv, header = TRUE)
|
||||||
|
|
||||||
|
#01-10-26 look for affil rosters
# Distinct author IDs (AuthorPHID) per isAuthorWMF value, returned ungrouped.
affils_ <- summarise(
  group_by(main_df, isAuthorWMF),
  n_authors = n_distinct(AuthorPHID),
  .groups = "drop"
)
|
||||||
|
|
||||||
|
#01-09-26 looking for comments that say certain things:
# Keep only rows whose comment_text contains "meeting" at least once; the
# per-row match count is kept in substring_count.
# NOTE(review): the identifier spelling is left as-is because other code may
# reference it.
relelvant_messages <- filter(
  mutate(main_df, substring_count = str_count(comment_text, "meeting")),
  substring_count != 0
)
|
||||||
|
|
||||||
|
# 01-09-26
# Row count for every comment_type/source combination.
split_of_comments <- main_df |>
  group_by(comment_type, source) |>
  summarize(
    count = n(),
    # Return an ungrouped table (and avoid the regrouping message), matching
    # the other summaries in this script.
    .groups = "drop"
  )
|
||||||
|
|
||||||
|
# Distinct author IDs (AuthorPHID) per source and isAuthorWMF value,
# returned ungrouped.
authors_count <- summarise(
  group_by(main_df, source, isAuthorWMF),
  n_authors = n_distinct(AuthorPHID),
  .groups = "drop"
)
|
||||||
|
|
||||||
|
|
||||||
|
#below 01-09-26
|
||||||
bz_summary <- main_df |>
|
bz_summary <- main_df |>
|
||||||
mutate(isBz = if_else(
|
mutate(isBz = if_else(
|
||||||
AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb", TRUE, FALSE
|
AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb", TRUE, FALSE
|
||||||
@ -64,7 +96,7 @@ summary_df <- tasks_flagged %>%
|
|||||||
TRUE ~ NA
|
TRUE ~ NA
|
||||||
)
|
)
|
||||||
) |>
|
) |>
|
||||||
group_by(period, source, isAuthorWMF) %>%
|
group_by(period, source) %>%
|
||||||
summarize(
|
summarize(
|
||||||
total_tasks = n(),
|
total_tasks = n(),
|
||||||
first_time_tasks = sum(is_first_time_author),
|
first_time_tasks = sum(is_first_time_author),
|
||||||
|
|||||||
BIN
doc_plots/011025_machine_label_comparison.png
Normal file
BIN
doc_plots/011025_machine_label_comparison.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 408 KiB |
BIN
doc_plots/011025_tasks_created.png
Normal file
BIN
doc_plots/011025_tasks_created.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 370 KiB |
BIN
doc_plots/011025_ttr_boxplot.png
Normal file
BIN
doc_plots/011025_ttr_boxplot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 402 KiB |
10
dsl/dsl.R
10
dsl/dsl.R
@ -82,7 +82,7 @@ dev_model <- dsl(
|
|||||||
)
|
)
|
||||||
summary(dev_model)
|
summary(dev_model)
|
||||||
#saveRDS(dev_model, "120725_logit_dsl.RDS")
|
#saveRDS(dev_model, "120725_logit_dsl.RDS")
|
||||||
#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
|
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
|
||||||
library(broom)
|
library(broom)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
|
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
|
||||||
@ -149,9 +149,11 @@ dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
|
|||||||
y = "Variable") +
|
y = "Variable") +
|
||||||
theme_minimal()
|
theme_minimal()
|
||||||
ggsave(
|
ggsave(
|
||||||
filename = "120825_dsl_coefs.png",
|
filename = "011025_dsl_coefs.png",
|
||||||
plot = dsl_coefs,
|
plot = dsl_coefs,
|
||||||
width = 8, # inches
|
width = 8, # inches
|
||||||
height = 6, # inches
|
height = 4, # inches
|
||||||
dpi = 600 # high resolution
|
dpi = 800 # high resolution
|
||||||
)
|
)
|
||||||
|
|
||||||
|
library(texreg)
|
||||||
|
|||||||
@ -188,17 +188,17 @@ tasks_created <- ggplot(
|
|||||||
linetype = "3313", color = "black", linewidth = 0.5) +
|
linetype = "3313", color = "black", linewidth = 0.5) +
|
||||||
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
|
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
|
||||||
geom_text(
|
geom_text(
|
||||||
data = subset(weekly_summary, source == "c1" & week_index == 6),
|
data = subset(weekly_summary, source == "c1" & week_index ==10),
|
||||||
aes(x=week_index, y=120, label='Opt-out deployment'),
|
aes(x=week_index, y=120, label='Opt-out deployment'),
|
||||||
size = 2.5) +
|
size = 3) +
|
||||||
geom_text(
|
geom_text(
|
||||||
data = subset(weekly_summary, source == "c1" & week_index == -33),
|
data = subset(weekly_summary, source == "c1" & week_index == -21),
|
||||||
aes(x=week_index, y=120, label='Opt-in Testing'),
|
aes(x=week_index, y=120, label='Opt-in Testing'),
|
||||||
size = 2.5) +
|
size = 3) +
|
||||||
geom_text(
|
geom_text(
|
||||||
data = subset(weekly_summary, source == "c2" & week_index == -12),
|
data = subset(weekly_summary, source == "c2" & week_index == -18),
|
||||||
aes(x=week_index, y=20, label='Deployment Announcement'),
|
aes(x=week_index, y=20, label='Deployment Announcement'),
|
||||||
size = 2.5) +
|
size = 3) +
|
||||||
theme_minimal() +
|
theme_minimal() +
|
||||||
scale_fill_viridis_d(
|
scale_fill_viridis_d(
|
||||||
breaks = c("FALSE", "TRUE", "BzImport"),
|
breaks = c("FALSE", "TRUE", "BzImport"),
|
||||||
@ -212,10 +212,10 @@ tasks_created <- ggplot(
|
|||||||
theme(legend.position = "top")
|
theme(legend.position = "top")
|
||||||
tasks_created
|
tasks_created
|
||||||
ggsave(
|
ggsave(
|
||||||
filename = "121625_tasks_created.png",
|
filename = "011025_tasks_created.png",
|
||||||
plot = tasks_created,
|
plot = tasks_created,
|
||||||
width = 12, # inches
|
width = 8, # inches
|
||||||
height = 6, # inches
|
height = 4, # inches
|
||||||
dpi = 800 # high resolution
|
dpi = 800 # high resolution
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
215
main_plot_script.R
Normal file
215
main_plot_script.R
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(dplyr)
|
||||||
|
library(tidyr)
|
||||||
|
# Read the DSL frame from CSV; the first row holds the column names.
dsl_csv <- "~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
|
||||||
|
#4.1
# One row per week_index / source / isAuthorWMF combination with task counts
# and medians of the olmo_* proportion columns and comment counts.
weekly_summary <- dsl_df |>
  group_by(week_index, source, isAuthorWMF) |>
  summarise(
    # Rows with a non-missing resolution_outcome.
    tasks_made = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(dsl_score),
    author_closer_sum = sum(author_closer == TRUE),
    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
    median_olmo_RK_prop_adac = median(olmo_RK_prop_adac),
    median_comments_before_resolution = median(n_comments_before),
    # Drop the leftover grouping so downstream code gets a plain table and
    # summarise() does not print its regrouping message.
    .groups = "drop"
  ) |>
  # Fix the level order of the affiliation factor.
  mutate(
    isAuthorWMF = factor(isAuthorWMF, levels = c("FALSE", "BzImport", "TRUE"))
  )
|
||||||
|
|
||||||
|
# Vertical marker lines per source facet: week position and linetype.
# Each marker is stored once, so every line is drawn exactly one time instead
# of once per row of weekly_summary.
milestone_lines <- tribble(
  ~source, ~week, ~linetype,
  "c1",    -29,   "dotted",
  "c1",     -9,   "dotted",
  "c1",     -4,   "3313",
  "c2",    -99,   "dotted",
  "c2",     -4,   "3313",
  "c3",    -97,   "dotted",
  "c3",     -3,   "3313"
)

# Text annotations per source facet: one row per label, so a label is never
# overplotted on itself and does not depend on a data row existing at that
# exact week.
milestone_labels <- tribble(
  ~source, ~week, ~height, ~label,
  "c1",     10,   120,     "Opt-out deployment",
  "c1",    -21,   120,     "Opt-in Testing",
  "c2",    -18,    20,     "Deployment Announcement"
)

# Dodged bar chart of tasks created per week, filled by author affiliation,
# with one facet row per source.
tasks_created <- ggplot(
  weekly_summary,
  aes(x = week_index, y = tasks_made, fill = isAuthorWMF)
) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  geom_vline(
    data = milestone_lines,
    aes(xintercept = week, linetype = linetype),
    color = "black",
    linewidth = 0.5
  ) +
  # The linetype column already holds literal linetype specs; use them as-is.
  scale_linetype_identity() +
  # Week 0 marker, drawn in every facet.
  geom_vline(
    xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  geom_text(
    data = milestone_labels,
    aes(x = week, y = height, label = label),
    # The annotation table has no isAuthorWMF/tasks_made columns, so the
    # plot-level aesthetics must not be inherited.
    inherit.aes = FALSE,
    size = 3
  ) +
  theme_minimal() +
  scale_fill_viridis_d(
    breaks = c("FALSE", "TRUE", "BzImport"),
    labels = c("Nonaffiliate", "WMF-affiliate", "BzImport")
  ) +
  labs(
    x = "Weeks from Feature Deployment",
    y = "Count of Tasks Created",
    fill = "Task Author"
  ) +
  theme(legend.position = "top")
tasks_created

ggsave(
  filename = "011025_tasks_created.png",
  plot = tasks_created,
  width = 8, # inches
  height = 4, # inches
  dpi = 800 # high resolution
)
|
||||||
|
|
||||||
|
|
||||||
|
#4.2 plot comparing the TTR for different things
# Per week_index / triage status / source: task count plus mean and standard
# deviation of time-to-resolution expressed in weeks (TTR_hours / 168).
ttr_trajectory <- dsl_df |>
  mutate(
    ttr_weeks = TTR_hours / 168,
    # Tasks whose priority is still "Needs Triage" count as not triaged.
    isTriaged = if_else(priority == "Needs Triage", "Not Triaged", "Triaged")
  ) |>
  group_by(week_index, isTriaged, source) |>
  summarise(
    count = n(),
    mean_ttr = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr = sd(ttr_weeks, na.rm = TRUE),
    # Return an ungrouped table and avoid the regrouping message.
    .groups = "drop"
  )
|
||||||
|
# Mean TTR (line, with a +/- 1 SD ribbon) and weekly task counts (points) for
# not-yet-triaged tasks from week -13 onward, one panel per source.
ttr_trajectory_plot <- ttr_trajectory |>
  filter(week_index >= -13, isTriaged == "Not Triaged") |>
  ggplot(aes(x = week_index)) +
  # Line for mean TTR
  geom_line(aes(y = mean_ttr, color = "Mean TTR"), linewidth = 1) +
  # Ribbon for standard deviation
  geom_ribbon(
    aes(ymin = mean_ttr - sd_ttr, ymax = mean_ttr + sd_ttr),
    fill = "lightblue",
    alpha = 0.4
  ) +
  # Points for count of tasks. geom_point() has no linewidth/linetype
  # parameters, so they are not passed (they were ignored with a warning).
  geom_point(aes(y = count, color = "Count of New Tasks")) +
  # Facet the plot by source and triaged status
  facet_wrap(source ~ isTriaged, scales = "free_y") +
  labs(
    title = "TTR by Source and Triage Status (TODO)",
    x = "Week Index",
    y = "Mean TTR (in weeks)",
    color = "Metrics"
  ) +
  scale_color_manual(
    values = c("Mean TTR" = "blue", "Count of New Tasks" = "red")
  ) +
  theme_minimal()
ttr_trajectory_plot
|
||||||
|
|
||||||
|
|
||||||
|
# Boxplots of time-to-resolution (weeks) per week_index, coloured by priority,
# for the three listed priority tags from week -13 onward; one facet row per
# source.
ttr_boxplot <- dsl_df |>
  filter(priority %in% c("Needs Triage", "Unbreak Now!", "High")) |>
  filter(week_index >= -13) |>
  ggplot(
    aes(
      x = as.factor(week_index),
      y = TTR_hours / 168,
      color = priority
    )
  ) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  # Outlier points are hidden; the y-range is cropped below instead.
  geom_boxplot(outlier.shape = NA) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 112)) +
  geom_text(
    # Single-row annotation so the label is drawn once, rather than once per
    # matching task row.
    # NOTE(review): x is numeric on a discrete axis, so 12 is presumably an
    # ordinal position rather than week 12 -- confirm the placement.
    data = data.frame(
      source = "c1", x = 12, y = 80, label = "Opt-in Testing"
    ),
    aes(x = x, y = y, label = label),
    inherit.aes = FALSE,
    color = "black",
    size = 3
  ) +
  # NOTE(review): 14 is likewise a position on the discrete axis; it lines up
  # with week 0 only if every week from -13 to 0 is present -- confirm.
  geom_vline(
    xintercept = 14, linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  scale_color_viridis_d(option = "turbo") +
  labs(
    x = "Weeks from Release",
    y = "Time to Resolution (weeks)",
    color = "Priority Tag"
  ) +
  theme(legend.position = "top")
ttr_boxplot

ggsave(
  filename = "011025_ttr_boxplot.png",
  plot = ttr_boxplot,
  width = 8, # inches
  height = 4, # inches
  dpi = 800 # high resolution
)
|
||||||
|
#4.3 plot comparing machine labels of information type
# From here on dsl_df excludes rows whose isAuthorWMF is "BzImport"
# (the data frame is overwritten in place).
dsl_df <- filter(dsl_df, isAuthorWMF != "BzImport")
|
||||||
|
|
||||||
|
# Long format: one row per task and olmo_* proportion column, with the column
# name reduced to its tag code and then mapped to a display label.
dsl_df_long <- dsl_df |>
  pivot_longer(
    cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
    names_to = "tag",
    values_to = "proportion"
  ) |>
  # Strip the "olmo_" prefix and "_prop_adac" suffix, leaving EP / RK / TSOL.
  mutate(tag = gsub("olmo_|_prop_adac", "", tag)) |>
  mutate(
    tag = case_when(
      tag == "EP" ~ "Existent Problem",
      tag == "RK" ~ "Record Keeping",
      tag == "TSOL" ~ "Solutions"
    )
  )
|
||||||
|
|
||||||
|
# Boxplots of the machine-labelled proportion per information-type tag,
# filled by isAuthorWMF, with one facet row per source.
olmo_comparison <- dsl_df_long |>
  ggplot(aes(x = tag, y = proportion, fill = isAuthorWMF)) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(
    x = "Issue Information Type Category",
    y = "% of sentences machine-labeled",
    color = "Is Author WMF?",
    fill = "Is Author WMF?"
  ) +
  theme(legend.position = "top")
olmo_comparison

# Write the figure as an 8 x 4 inch PNG at 800 dpi.
ggsave(
  "011025_machine_label_comparison.png",
  plot = olmo_comparison,
  width = 8,
  height = 4,
  dpi = 800
)
|
||||||
17
mgaughan-rstudio-server_32251441.out
Normal file
17
mgaughan-rstudio-server_32251441.out
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
|
ssh -N -L 8787:n3443:42777 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
|
2. log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
|
user: mjilg
|
||||||
|
password: u+Vtuz9i8I2EYxQXIDps
|
||||||
|
|
||||||
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
|
scancel -f 32251441
|
||||||
Loading…
Reference in New Issue
Block a user