1
0

simple bivariate plots to look at variance, or lack thereof.

This commit is contained in:
Matthew Gaughan 2025-10-07 15:00:59 -07:00
parent 6fb1801b2a
commit 840b32a2e4
4 changed files with 3226 additions and 21 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,94 @@
library(tidyverse)
library(jsonlite)
library(lubridate)
## TODO: get the within-case seniority
# Read the bivariate analysis table and derive `new_priority_score`, a copy of
# `priority_score` in which every value equal to 90 is replaced by NA.
main_csv <- "~/analysis_data/100725_bivariate_data.csv"
# task_df <- main_df |>
#   filter(comment_type == "task_description")
main_df <- read.csv(file = main_csv, header = TRUE) |>
  mutate(
    new_priority_score = replace(
      priority_score,
      which(priority_score == 90),
      NA
    )
  )
library(ggdist)
# Dot plot of n_comments against week_index, filled by isAuthorWMF, with one
# facet column per source.
# The title and y-axis label describe what is actually mapped (n_comments,
# faceted by source only); they previously referred to priority scores and to
# phase faceting, neither of which this plot uses.
ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) +
  facet_grid(~source) +
  geom_dots(side = "both", layout = "hex", stackratio = 0.92) +
  scale_fill_viridis_d() +
  # xlim() sets scale limits: rows with week_index outside [-130, 15] are
  # removed (with a warning), not merely hidden.
  xlim(-130, 15) +
  theme_minimal() +
  labs(
    title = "Comment counts by week_index (faceted by source)",
    x = "week_index",
    y = "n_comments",
    fill = "isAuthorWMF?"
  )
library(ggplot2)
# Histogram + interval summary of new_priority_score for each
# resolution_outcome, filled by isAuthorWMF; source in rows, phase in columns.
ggplot(
  data = main_df,
  mapping = aes(
    x = resolution_outcome,
    y = new_priority_score,
    fill = isAuthorWMF
  )
) +
  stat_histinterval() +
  facet_grid(rows = vars(source), cols = vars(phase)) +
  labs(
    fill = "isTaskAuthorWMF?",
    y = "priority score (post-triage)",
    x = "on-time resolution (wide release date +90 days)",
    title = "Histogram of triaged priority scores by task outcome and affiliation (faceted by source and phase)"
  ) +
  theme_minimal()
# Heatmap of row counts for every combination of resolution_outcome and
# isAuthorWMF, with source in facet rows and phase in facet columns. Each tile
# is coloured by its count and also carries the count as a text label.
main_df |>
  count(phase, source, resolution_outcome, isAuthorWMF) |>
  ggplot(
    mapping = aes(
      x = isAuthorWMF,
      y = resolution_outcome,
      label = n,
      fill = n
    )
  ) +
  geom_tile() +
  geom_text(color = "white", size = 5) +
  facet_grid(rows = vars(source), cols = vars(phase)) +
  scale_fill_viridis_c() +
  labs(
    fill = "count of tasks",
    y = "on-time resolution (wide release date +90 days)",
    x = "isTaskAuthorWMF?",
    title = "Count of Tasks by on-time resolution"
  ) +
  theme_minimal()
# Quantile dot plot (100 quantiles) of week_index for each resolution_outcome,
# with dodged groups filled by isAuthorWMF and one facet column per source.
ggplot(
  data = main_df,
  mapping = aes(
    x = week_index,
    y = resolution_outcome,
    fill = isAuthorWMF
  )
) +
  stat_dots(quantiles = 100, position = "dodgejust", color = NA) +
  facet_grid(cols = vars(source)) +
  labs(
    fill = "isTaskAuthorWMF?",
    y = "on-time resolution (wide release date +90 days) ",
    x = "week_index of task filed",
    title = "centile (1/100) distribution dot plot of tasks (faceted by source)"
  ) +
  theme_minimal()
# Scatter plot with a smoother of priority score against week_index, coloured
# by resolution_outcome, with one facet column per source.
# The y axis is labelled "triaged priority score", so it plots
# new_priority_score (priority_score with the value 90 masked to NA) rather
# than the raw column; otherwise the masked value would be drawn as a real
# score and pull the geom_smooth() fit towards it.
# NOTE(review): 90 is presumably the "not yet triaged" sentinel -- confirm.
ggplot(
  main_df,
  aes(y = new_priority_score, x = week_index, color = resolution_outcome)
) +
  facet_grid(~source) +
  geom_point() +
  geom_smooth() +
  theme_minimal() +
  labs(
    title = "week_index x priority_score(faceted by source)",
    x = "week_index of task filed",
    y = "triaged priority score",
    color = "on-time resolution (wide release date +90 days)"
  )
# Scatter plot of priority score against median_gerrit_reviewers, coloured by
# isAuthorWMF, with one facet column per source.
# The y axis is labelled "triaged priority score", so it plots
# new_priority_score (priority_score with the value 90 masked to NA) rather
# than the raw column, which would show the masked value as a real score.
# NOTE(review): 90 is presumably the "not yet triaged" sentinel -- confirm.
ggplot(
  main_df,
  aes(y = new_priority_score, x = median_gerrit_reviewers, color = isAuthorWMF)
) +
  facet_grid(~source) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "gerrit reviewers x priority_score(faceted by source)",
    x = "median # of gerrit reviewers for linked PRs",
    y = "triaged priority score",
    color = "isAuthorWMF?"
  )

View File

@ -31,12 +31,12 @@ human_result <- human_df %>%
# for each task filer,
# GET the proportion of Observed bug behavior + Expected Behavior
# GET the proportion of Solution Discussion + Solution Usage
main_csv <-"~/analysis_data/092925_unified_phab.csv"
main_csv <-"~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)
closed_relevance_summary <- main_df %>%
filter(comment_type == "task_description") %>%
select(TaskPHID, closed_relevance, priority_score, source, phase, week_index)
select(TaskPHID, resolution_outcome, priority_score, priority, source, phase, week_index, isAuthorWMF)
# TODO: need to get the TaskAuthor's comments, not just the big picture
machine_result <- main_df %>%
@ -51,9 +51,9 @@ machine_result <- main_df %>%
n_comments = sum(comment_type == "task_subcomment")
) |>
left_join(closed_relevance_summary, by = "TaskPHID") |>
mutate(dsl_score = ifelse(closed_relevance == "True", 1, 0))
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0))
output_df <- machine_result |>
left_join(human_result, by = "TaskPHID")
write.csv(output_df, "093025_power_dsl.csv", row.names = FALSE)
write.csv(machine_result, "100725_bivariate_data.csv", row.names = FALSE)

View File

@ -1,17 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3441:37935 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: vOc4KzJoEbZDstjf4p5Q
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 29987861