simple bivariate plots to look at variance, or lack thereof.

2025-10-07 15:00:59 -07:00 · 2025-10-07 15:00:59 -07:00 · 840b32a2e4
commit 840b32a2e4
parent 6fb1801b2a
4 changed files with 3226 additions and 21 deletions
--- a/analysis_data/100725_bivariate_data.csv
+++ b/analysis_data/100725_bivariate_data.csv
--- a/analysis_data/100725_bivariate_plots.R
+++ b/analysis_data/100725_bivariate_plots.R
@ -0,0 +1,94 @@
+library(tidyverse)
+library(jsonlite)
+library(lubridate)
+## TODO: get the within-case seniority
+# Load the bivariate dataset; the CSV's first row holds the column names.
+main_csv <- "~/analysis_data/100725_bivariate_data.csv"
+main_df <- read.csv(file = main_csv, header = TRUE)
+
+#task_df <- main_df |>
+#  filter(comment_type == "task_description")
+
+# Add new_priority_score: a copy of priority_score in which the value 90 is
+# replaced by NA (presumably 90 marks untriaged tasks -- TODO confirm).
+main_df <- mutate(
+  main_df,
+  new_priority_score = ifelse(priority_score == 90, NA, priority_score)
+)
+
+library(ggdist)
+
+# Dot plot of n_comments against week_index, one panel per source, with the
+# dots filled by isAuthorWMF.
+ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) +
+  facet_grid(~source) +
+  geom_dots(side = "both", layout = "hex", stackratio = 0.92) +
+  scale_fill_viridis_d() +
+  # Restrict the x axis to week_index values between -130 and 15.
+  xlim(-130, 15) +
+  theme_minimal() +
+  # Labels describe what is actually mapped: y is n_comments and the only
+  # facet is source (the earlier labels named priority score and phase).
+  labs(
+    title = "n_comments by week_index (faceted by source)",
+    x = "week_index",
+    y = "n_comments",
+    fill = "isAuthorWMF?"
+  )
+
+library(ggplot2)
+
+# Histogram-interval summary of new_priority_score for each
+# resolution_outcome, filled by isAuthorWMF; panel rows are sources and
+# panel columns are phases.
+ggplot(
+  data = main_df,
+  mapping = aes(
+    x = resolution_outcome,
+    y = new_priority_score,
+    fill = isAuthorWMF
+  )
+) +
+  stat_histinterval() +
+  facet_grid(rows = vars(source), cols = vars(phase)) +
+  labs(
+    title = "Histogram of triaged priority scores by task outcome and affiliation (faceted by source and phase)",
+    x = "on-time resolution (wide release date +90 days)",
+    y = "priority score (post-triage)",
+    fill = "isTaskAuthorWMF?"
+  ) +
+  theme_minimal()
+
+# Heat map of row counts for every phase / source / resolution_outcome /
+# isAuthorWMF combination, with the count printed on top of each tile.
+main_df |>
+  count(phase, source, resolution_outcome, isAuthorWMF) |>
+  ggplot(
+    mapping = aes(
+      x = isAuthorWMF,
+      y = resolution_outcome,
+      fill = n,
+      label = n
+    )
+  ) +
+  # Tiles are drawn first so the text labels sit on top of them.
+  geom_tile() +
+  geom_text(size = 5, color = "white") +
+  facet_grid(rows = vars(source), cols = vars(phase)) +
+  scale_fill_viridis_c() +
+  labs(
+    title = "Count of Tasks by on-time resolution",
+    x = "isTaskAuthorWMF?",
+    y = "on-time resolution (wide release date +90 days)",
+    fill = "count of tasks"
+  ) +
+  theme_minimal()
+
+# Quantile dot plot (100 quantiles) of week_index for each
+# resolution_outcome, dodged and filled by isAuthorWMF, one panel per source.
+ggplot(main_df) +
+  aes(x = week_index, y = resolution_outcome, fill = isAuthorWMF) +
+  stat_dots(quantiles = 100, color = NA, position = "dodgejust") +
+  facet_grid(cols = vars(source)) +
+  labs(
+    title = "centile (1/100) distribution dot plot of tasks (faceted by source)",
+    x = "week_index of task filed",
+    y = "on-time resolution (wide release date +90 days) ",
+    fill = "isTaskAuthorWMF?"
+  ) +
+  theme_minimal()
+
+# Scatter plot with a smoothed trend of priority score over week_index,
+# coloured by resolution_outcome, one panel per source.
+# Plots new_priority_score (priority_score with the value 90 set to NA,
+# computed earlier in this script) so the data matches the "triaged" axis
+# label and the histogram plot above; rows with NA are left out by the geoms.
+# NOTE(review): presumably 90 is the "needs triage" value -- confirm.
+ggplot(
+  main_df,
+  aes(y = new_priority_score, x = week_index, color = resolution_outcome)
+) +
+  facet_grid(~source) +
+  geom_point() +
+  geom_smooth() +
+  theme_minimal() +
+  labs(
+    title = "week_index x priority_score (faceted by source)",
+    x = "week_index of task filed",
+    y = "triaged priority score",
+    color = "on-time resolution (wide release date +90 days)"
+  )
+
+# Scatter plot of priority score against median_gerrit_reviewers, coloured
+# by isAuthorWMF, one panel per source.
+# Plots new_priority_score (priority_score with the value 90 set to NA,
+# computed earlier in this script) so the data matches the "triaged" axis
+# label; rows with NA are left out by geom_point.
+# NOTE(review): presumably 90 is the "needs triage" value -- confirm.
+ggplot(
+  main_df,
+  aes(y = new_priority_score, x = median_gerrit_reviewers, color = isAuthorWMF)
+) +
+  facet_grid(~source) +
+  geom_point() +
+  theme_minimal() +
+  labs(
+    title = "gerrit reviewers x priority_score (faceted by source)",
+    x = "median # of gerrit reviewers for linked PRs",
+    y = "triaged priority score",
+    color = "isAuthorWMF?"
+  )
--- a/dsl/dsl_data_transform.R
+++ b/dsl/dsl_data_transform.R
@ -31,12 +31,12 @@ human_result <- human_df %>%
 # for each task filer, 
 # GET the proportion of Observed bug behavior  + Expected Behavior
 # GET the proportion of Solution Discussion  + Solution Usage 
-main_csv <-"~/analysis_data/092925_unified_phab.csv"
+main_csv <-"~/analysis_data/100625_unified_w_affil.csv"
 main_df <- read.csv(main_csv, header = TRUE) 

 closed_relevance_summary <- main_df %>%
  filter(comment_type == "task_description") %>%
-  select(TaskPHID, closed_relevance, priority_score, source, phase, week_index)
+  select(TaskPHID, resolution_outcome, priority_score, priority, source, phase, week_index, isAuthorWMF)

 # TODO: need to get the TaskAuthor's comments, not just the big picture
 machine_result <- main_df %>%
@ -51,9 +51,9 @@ machine_result <- main_df %>%
    n_comments = sum(comment_type == "task_subcomment")
  ) |>
  left_join(closed_relevance_summary, by = "TaskPHID") |>
-  mutate(dsl_score = ifelse(closed_relevance == "True", 1, 0))
+  mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0))

 output_df <- machine_result |>
  left_join(human_result, by = "TaskPHID")

-write.csv(output_df, "093025_power_dsl.csv", row.names = FALSE)
+write.csv(machine_result, "100725_bivariate_data.csv", row.names = FALSE)
--- a/mgaughan-rstudio-server_29987861.out
+++ b/mgaughan-rstudio-server_29987861.out
@ -1,17 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3441:37935 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: vOc4KzJoEbZDstjf4p5Q
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 29987861