1
0

adding some more metadata to the DSL aggregation files

This commit is contained in:
Matthew Gaughan 2025-11-10 14:32:14 -08:00
parent be587982d7
commit 7555259a3e
5 changed files with 28197 additions and 25049 deletions

File diff suppressed because one or more lines are too long

View File

@ -40,6 +40,9 @@ main_df <- main_df |>
!is.na(task_desc_author) &
AuthorPHID == task_desc_author &
(is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
),
before_close = as.integer(
(is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
)
)
# add dictionary values

3130
dsl/110925_DSL_df_adac.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
library(tidyverse)
unified_csv <-"~/analysis_data/102725_unified.csv"
unified_csv <-"~/analysis_data/110925_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
@ -223,12 +223,14 @@ task_level_variables <- unified_df |>
median_PC4 = median(PC4),
median_PC4_adac = median(PC4[ADAC==1]),
median_PC4_no_adac = median(PC4[ADAC==0]),
n_comments = sum(!is.na(id)),
n_comments_before = sum(before_close)
)
descriptions <- unified_df |>
filter(comment_type == "task_description")|>
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
source, phase, week_index, author_closer, resolution_outcome )
source, phase, week_index, author_closer, resolution_outcome, priority )
task_level_variables <- task_level_variables |>
left_join(
@ -286,21 +288,6 @@ ggplot(task_level_variables, aes(
fill = "Resolution Outcome"
)
ggplot(task_level_variables, aes(
x = median_PC3_ADAC,
y = TTR,
fill = isAuthorWMF
)) +
facet_grid(~source, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-20, 20) +
ylim(0, 1440) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Median PC3 Value in ADAC Comments",
x = "Median PC3 Value",
y = "Time to Resolution (up to 60 days)",
)
# 4. save
write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
write.csv(task_level_variables, "110925_DSL_df_adac.csv", row.names = FALSE)

View File

@ -1,16 +1,18 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
# Per (source, isAuthorWMF) summary of task resolution outcomes, time to
# resolution, and comment volume before closing.
outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF) |>
  summarise(
    # Number of tasks in the group with a recorded (non-NA) outcome.
    total_sum = sum(!is.na(resolution_outcome)),
    # na.rm = TRUE so the numerator ignores the same NA rows that the
    # denominator above excludes; without it a single NA outcome turns the
    # whole group's count (and success_prop) into NA.
    count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE),
    success_prop = count_resolution_outcome / total_sum,
    # TTR is divided by 24 -- presumably hours to days; confirm the unit
    # against the upstream aggregation script.
    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
    # na.rm = TRUE for consistency with the TTR median: n_comments_before
    # can be NA when any comment's before_close flag is NA.
    median_comments_before_resolution = median(
      n_comments_before,
      na.rm = TRUE
    )
  )
@ -18,6 +20,32 @@ library(ggplot2)
library(ggdist)
# Half-eye plot of the per-task n_comments_before values, coloured and
# filled by source, with one facet column per isAuthorWMF value.
dsl_df |>
  ggplot(aes(x = n_comments_before, color = source, fill = source)) +
  facet_grid(. ~ isAuthorWMF) +
  stat_halfeye() +
  theme_minimal()
# Recode priority as a factor with an explicit level order so that plots
# show the categories in this order. Values not listed in `levels` become NA.
dsl_df$priority <- factor(
  dsl_df$priority,
  levels = c(
    "Unbreak Now!",
    "High",
    "Medium",
    "Low",
    "Lowest",
    "Needs Triage"
  )
)
# Bar chart of task counts per priority level, stacked by resolution outcome,
# with one facet column per source.
#
# resolution_outcome is summed elsewhere in this script, so it is presumably
# stored as numeric 0/1 (confirm against the CSV). A numeric fill is treated
# as continuous and dropped by geom_bar()'s count stat, leaving the bars
# unsplit; factor() makes it discrete. Harmless if the column is already
# logical or character.
ggplot(
  dsl_df,
  aes(
    x = priority,
    fill = factor(resolution_outcome)
  )
) +
  facet_grid(~source) +
  geom_bar() +
  theme_minimal() +
  # Keep the legend title as the plain column name rather than the
  # "factor(...)" expression.
  labs(fill = "resolution_outcome")
#' Sign-preserving power transform
#'
#' Raises the absolute value of `x` to the power `p` and then restores the
#' sign of `x`, so negative inputs give negative outputs.
#'
#' @param x A numeric vector.
#' @param p The exponent applied to `abs(x)`.
#' @return `sign(x) * abs(x)^p`, computed elementwise.
signed_power <- function(x, p) {
  magnitude <- abs(x)^p
  sign(x) * magnitude
}