adding some more metadata to the DSL aggregation files

2025-11-10 14:32:14 -08:00 · 2025-11-10 14:32:14 -08:00 · 7555259a3e
commit 7555259a3e
parent be587982d7
5 changed files with 28197 additions and 25049 deletions
--- a/analysis_data/110925_unified.csv
+++ b/analysis_data/110925_unified.csv
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@ -40,6 +40,9 @@ main_df <- main_df |>
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
+    ),
+    before_close = as.integer(
+      (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
    )
  )
 # add dictionary values 
--- a/dsl/110925_DSL_df_adac.csv
+++ b/dsl/110925_DSL_df_adac.csv
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -1,6 +1,6 @@
 library(tidyverse)

 # Path of the unified CSV (under the home directory) and the data frame read
 # from it with a header row. The commit only swaps the dated file name.
-unified_csv <-"~/analysis_data/102725_unified.csv"
+unified_csv <-"~/analysis_data/110925_unified.csv"
 unified_df <- read.csv(unified_csv, header = TRUE) 

 # 1. aggregate to the task level 
@ -223,12 +223,14 @@ task_level_variables <- unified_df |>
            median_PC4 = median(PC4),
            median_PC4_adac = median(PC4[ADAC==1]),
            median_PC4_no_adac = median(PC4[ADAC==0]),
+            n_comments = sum(!is.na(id)),
+            n_comments_before = sum(before_close)
            )

 # Keep only the rows whose comment_type is "task_description" and retain the
 # task-level metadata columns listed in select(); this commit adds `priority`
 # to that list.
 # NOTE(review): presumably one task_description row exists per TaskPHID, so a
 # later join on it does not duplicate tasks -- confirm.
 descriptions <- unified_df |>
  filter(comment_type == "task_description")|>
  select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF, 
-         source, phase, week_index, author_closer, resolution_outcome )
+         source, phase, week_index, author_closer, resolution_outcome, priority )

 task_level_variables <- task_level_variables |>
  left_join(
@ -286,21 +288,6 @@ ggplot(task_level_variables, aes(
    fill = "Resolution Outcome"
  )

-ggplot(task_level_variables, aes(
-  x = median_PC3_ADAC, 
-  y = TTR, 
-  fill = isAuthorWMF
-  )) +
-  facet_grid(~source, scales="fixed") +
-  geom_point(shape = 21, alpha=0.3, size=2) +
-  xlim(-20, 20) + 
-  ylim(0, 1440) + 
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Median PC3 Value in ADAC Comments",
-    x = "Median PC3 Value",
-    y = "Time to Resolution (up to 60 days)",
-  )
+
 # 4. save
 # Writes task_level_variables as CSV without row names. The file name is
 # relative, so the file is created in the current working directory.
 # NOTE(review): final_bivariate.R reads "~/dsl/110925_DSL_df_adac.csv"; the two
 # paths agree only when this script runs with ~/dsl as working directory.
-write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
+write.csv(task_level_variables, "110925_DSL_df_adac.csv", row.names = FALSE)
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@ -1,16 +1,18 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
 # Path of the task-level DSL CSV (under the home directory) and the data frame
 # read from it with a header row. The commit only swaps the dated file name.
-dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 

+
 # One summary row per (source, isAuthorWMF) combination:
 #   total_sum                 - rows with a non-missing resolution_outcome
 #   count_resolution_outcome  - sum of resolution_outcome
 #   success_prop              - count_resolution_outcome / total_sum
 #   median_ttr_days           - median TTR (NAs dropped) divided by 24
 #   median_comments_before_resolution - median of n_comments_before
 # NOTE(review): only the TTR median passes na.rm = TRUE. sum(resolution_outcome)
 # and median(n_comments_before) return NA for any group that contains an NA,
 # even though total_sum explicitly counts non-missing values -- confirm these
 # columns are NA-free or add na.rm = TRUE.
 # NOTE(review): success_prop is a proportion only if resolution_outcome is
 # coded 0/1 (or logical), and "/ 24" yields days only if TTR is in hours --
 # confirm both.
 # NOTE(review): with two grouping variables summarise() drops only the last
 # one, so the result stays grouped by `source`; ungroup() if that is unwanted.
 outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF)|>
  summarise(
    total_sum = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(resolution_outcome),
    success_prop = count_resolution_outcome / total_sum,
-    median_ttr_days = median(TTR, na.rm = TRUE) / 24
+    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
+    median_comments_before_resolution = median(n_comments_before)
    )


@ -18,6 +20,32 @@ library(ggplot2)
 library(ggdist)


 # Distribution of n_comments_before drawn with stat_halfeye(), coloured and
 # filled by source, with one facet column per isAuthorWMF value.
 # NOTE(review): the plot is printed, not assigned or saved; that is fine for
 # interactive use, but a ggsave() is needed if the figure must be kept.
+ggplot(
+  dsl_df, 
+  aes(
+    x=n_comments_before,
+    color=source,
+    fill=source
+  )
+) +
+  facet_grid(~isAuthorWMF) +
+  stat_halfeye() +
+  theme_minimal() 
+
 # Recode priority as a factor with an explicit level order, so plots list the
 # categories in this order rather than alphabetically.
 # NOTE(review): factor() turns any value not listed in `levels` into NA --
 # verify the six labels cover every priority string present in the CSV.
+dsl_df <- dsl_df |>
+  mutate(priority = factor(priority, 
+         levels = c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")))
+
 # Bar chart of row counts per priority level, one facet column per source,
 # with resolution_outcome mapped to the fill aesthetic.
 # NOTE(review): resolution_outcome is summed in outcome_summary, so it looks
 # numeric; a continuous fill creates no groups and the bars will not split by
 # outcome. If that is the case, map factor(resolution_outcome) instead.
+ggplot(dsl_df, 
+       aes(
+         fill=resolution_outcome, 
+         x=priority
+       )) + 
+  facet_grid(~source) +
+  geom_bar() +
+theme_minimal() 
+
+
 # Sign-preserving power transform: raises the magnitude of `x` to the power
 # `p` and then restores the original sign of each element, so negative inputs
 # stay negative (and real-valued) even for fractional exponents.
 #   x: numeric vector to transform
 #   p: exponent applied to abs(x)
 # Returns a numeric vector following R's usual recycling of `x` and `p`.
 signed_power <- function(x, p) {
   magnitude <- abs(x)^p
   sign(x) * magnitude
 }