simple bivariate plots to look at variance, or lack thereof.
This commit is contained in:
parent
6fb1801b2a
commit
840b32a2e4
3128
analysis_data/100725_bivariate_data.csv
Normal file
3128
analysis_data/100725_bivariate_data.csv
Normal file
File diff suppressed because it is too large
Load Diff
94
analysis_data/100725_bivariate_plots.R
Normal file
94
analysis_data/100725_bivariate_plots.R
Normal file
@ -0,0 +1,94 @@
|
||||
library(tidyverse)
|
||||
library(jsonlite)
|
||||
library(lubridate)
|
||||
## TODO: get the within-case seniority
|
||||
# Read the bivariate dataset for this analysis from the home-relative path.
main_csv <- "~/analysis_data/100725_bivariate_data.csv"
main_df <- read.csv(main_csv, header = TRUE)

# task_df <- main_df |>
#   filter(comment_type == "task_description")

# Add new_priority_score: a copy of priority_score in which every value equal
# to 90 is replaced by NA; all other values are carried over unchanged.
main_df <- mutate(
  main_df,
  new_priority_score = ifelse(
    test = priority_score == 90,
    yes = NA,
    no = priority_score
  )
)
|
||||
|
||||
library(ggdist)
|
||||
|
||||
# Dot plot of n_comments (y) against week_index (x), filled by isAuthorWMF,
# with one panel per source.
ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) +
  facet_grid(~source) +
  geom_dots(side = "both", layout = "hex", stackratio = 0.92) +
  scale_fill_viridis_d() +
  # Restrict the x axis to week_index values between -130 and 15.
  xlim(-130, 15) +
  theme_minimal() +
  # Labels name the variables that are actually mapped above: the y axis is
  # n_comments and the only facet variable is source.
  labs(
    title = "Number of comments by week_index (faceted by source)",
    x = "week_index",
    y = "n_comments",
    fill = "isAuthorWMF?"
  )
|
||||
|
||||
library(ggplot2)
|
||||
|
||||
# Histogram-with-interval view of new_priority_score for each value of
# resolution_outcome, filled by isAuthorWMF. Panels: source (rows) by
# phase (columns).
ggplot(
  main_df,
  aes(x = resolution_outcome, y = new_priority_score, fill = isAuthorWMF)
) +
  stat_histinterval() +
  facet_grid(rows = vars(source), cols = vars(phase)) +
  labs(
    title = "Histogram of triaged priority scores by task outcome and affiliation (faceted by source and phase)",
    x = "on-time resolution (wide release date +90 days)",
    y = "priority score (post-triage)",
    fill = "isTaskAuthorWMF?"
  ) +
  theme_minimal()
|
||||
|
||||
# Heat map of row counts: count() tallies main_df by phase, source,
# resolution_outcome and isAuthorWMF, and each tally (column n) is drawn as a
# tile with the number printed on top. Panels: source (rows) by phase (columns).
ggplot(
  count(main_df, phase, source, resolution_outcome, isAuthorWMF),
  aes(x = isAuthorWMF, y = resolution_outcome, fill = n, label = n)
) +
  geom_tile() +
  geom_text(size = 5, color = "white") +
  facet_grid(rows = vars(source), cols = vars(phase)) +
  scale_fill_viridis_c() +
  labs(
    title = "Count of Tasks by on-time resolution",
    x = "isTaskAuthorWMF?",
    y = "on-time resolution (wide release date +90 days)",
    fill = "count of tasks"
  ) +
  theme_minimal()
|
||||
|
||||
# Quantile dot plot of week_index for each resolution_outcome, filled by
# isAuthorWMF and dodged within each outcome; one panel per source.
ggplot(
  main_df,
  aes(x = week_index, y = resolution_outcome, fill = isAuthorWMF)
) +
  # quantiles = 100 summarises each group with 100 dots; color = NA removes
  # the dot outlines.
  stat_dots(position = "dodgejust", quantiles = 100, color = NA) +
  facet_grid(cols = vars(source)) +
  labs(
    title = "centile (1/100) distribution dot plot of tasks (faceted by source)",
    x = "week_index of task filed",
    y = "on-time resolution (wide release date +90 days) ",
    fill = "isTaskAuthorWMF?"
  ) +
  theme_minimal()
|
||||
|
||||
# Scatter plot with a smoothed trend of priority score against week_index,
# coloured by resolution_outcome; one panel per source.
#
# The y axis uses new_priority_score (priority_score with the value 90 recoded
# to NA earlier in this script) so that the axis matches its "triaged" label
# and the recoded value does not pull the geom_smooth() trend.
# NOTE(review): assumes 90 marks not-yet-triaged tasks -- confirm; revert to
# priority_score if those rows are meant to be shown.
ggplot(
  main_df,
  aes(y = new_priority_score, x = week_index, color = resolution_outcome)
) +
  facet_grid(~source) +
  geom_point() +
  geom_smooth() +
  theme_minimal() +
  labs(
    title = "week_index x priority_score (faceted by source)",
    x = "week_index of task filed",
    y = "triaged priority score",
    color = "on-time resolution (wide release date +90 days)"
  )
|
||||
|
||||
# Scatter plot of priority score against median_gerrit_reviewers, coloured by
# isAuthorWMF; one panel per source.
#
# The y axis uses new_priority_score (priority_score with the value 90 recoded
# to NA earlier in this script) so that the axis matches its "triaged" label.
# NOTE(review): assumes 90 marks not-yet-triaged tasks -- confirm; revert to
# priority_score if those rows are meant to be shown.
ggplot(
  main_df,
  aes(y = new_priority_score, x = median_gerrit_reviewers, color = isAuthorWMF)
) +
  facet_grid(~source) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "gerrit reviewers x priority_score (faceted by source)",
    x = "median # of gerrit reviewers for linked PRs",
    y = "triaged priority score",
    color = "isAuthorWMF?"
  )
|
||||
@ -31,12 +31,12 @@ human_result <- human_df %>%
|
||||
# for each task filer,
|
||||
# GET the proportion of Observed bug behavior + Expected Behavior
|
||||
# GET the proportion of Solution Discussion + Solution Usage
|
||||
main_csv <-"~/analysis_data/092925_unified_phab.csv"
|
||||
main_csv <-"~/analysis_data/100625_unified_w_affil.csv"
|
||||
main_df <- read.csv(main_csv, header = TRUE)
|
||||
|
||||
closed_relevance_summary <- main_df %>%
|
||||
filter(comment_type == "task_description") %>%
|
||||
select(TaskPHID, closed_relevance, priority_score, source, phase, week_index)
|
||||
select(TaskPHID, resolution_outcome, priority_score, priority, source, phase, week_index, isAuthorWMF)
|
||||
|
||||
# TODO: need to get the TaskAuthor's comments, not just the big picture
|
||||
machine_result <- main_df %>%
|
||||
@ -51,9 +51,9 @@ machine_result <- main_df %>%
|
||||
n_comments = sum(comment_type == "task_subcomment")
|
||||
) |>
|
||||
left_join(closed_relevance_summary, by = "TaskPHID") |>
|
||||
mutate(dsl_score = ifelse(closed_relevance == "True", 1, 0))
|
||||
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0))
|
||||
|
||||
output_df <- machine_result |>
|
||||
left_join(human_result, by = "TaskPHID")
|
||||
|
||||
write.csv(output_df, "093025_power_dsl.csv", row.names = FALSE)
|
||||
write.csv(machine_result, "100725_bivariate_data.csv", row.names = FALSE)
|
||||
|
||||
@ -1,17 +0,0 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3441:37935 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: vOc4KzJoEbZDstjf4p5Q
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 29987861
|
||||
Loading…
Reference in New Issue
Block a user