1
0

updating for new bivariate plots

This commit is contained in:
Matthew Gaughan 2025-11-03 10:04:42 -08:00
parent 2efd961fed
commit 6f2858dd72
6 changed files with 136 additions and 5 deletions

View File

@ -15,7 +15,7 @@ library(ggdist)
ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) + ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) +
facet_grid(~source) + facet_grid(~source) +
geom_dots(side = "both", layout = "hex", stackratio = 0.92) + geom_dots(side= "both", layout = "hex", stackratio = 0.92) +
scale_fill_viridis_d() + scale_fill_viridis_d() +
xlim(-130, 15) + xlim(-130, 15) +
theme_minimal() + theme_minimal() +

View File

@ -7,6 +7,19 @@ library(purrr)
unified_csv <-"~/analysis_data/102725_unified.csv" unified_csv <-"~/analysis_data/102725_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE) unified_df <- read.csv(unified_csv, header = TRUE)
# Build a numeric 0/1 flag: 1 when `olmo_sentence_labels` is present and
# matches "ACTION ON ISSUE" or "TASK PROGRESS" (case-insensitive), else 0.
unified_df <- mutate(
  unified_df,
  has_update = as.numeric(
    !is.na(olmo_sentence_labels) &
      grepl(
        "(ACTION ON ISSUE|TASK PROGRESS)",
        as.character(olmo_sentence_labels),
        ignore.case = TRUE
      )
  )
)

# Logistic regression of the flag on PC1-PC4 and modal_verbs.
model <- glm(
  has_update ~ PC1 + PC2 + PC3 + PC4 + modal_verbs,
  family = binomial(link = "logit"),
  data = unified_df
)
summary(model)
unified_df |> unified_df |>
ggplot( ggplot(

56
dsl/dsl.R Normal file
View File

@ -0,0 +1,56 @@
library(tidyverse)
library(dsl)
# Load the DSL data frame from disk.
dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)

# Run power_dsl() for five candidate labeled-sample sizes on a logit model of
# dsl_score. `human_SOL_prop_adac` is the variable declared as predicted, with
# `olmo_SOL_prop_adac` supplied as its prediction and `sampling_prob` as the
# per-row sampling probability column.
power_model <- power_dsl(
  labeled_size = c(100, 200, 300, 600, 1000),
  model = "logit",
  formula = dsl_score ~ human_SOL_prop_adac +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(source) +
    median_PC3_adac +
    week_index,
  predicted_var = "human_SOL_prop_adac",
  prediction = "olmo_SOL_prop_adac",
  sample_prob = "sampling_prob",
  data = dsl_df
)
summary(power_model)

# Plot the power results for the human_SOL_prop_adac coefficient.
plot(power_model, coef_name = "human_SOL_prop_adac")
# Keep only rows from source "c1". This overwrites dsl_df, so every later use
# of dsl_df in this script sees the c1 subset only.
dsl_df <- filter(dsl_df, source == "c1")

# DSL logit model of dsl_score. `human_BI_prop_adac` is the variable declared
# as predicted, with `olmo_BI_prop_adac` supplied as its prediction.
trial_model <- dsl(
  model = "logit",
  formula = dsl_score ~ human_BI_prop_adac +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC4_adac +
    week_index,
  predicted_var = "human_BI_prop_adac",
  prediction = "olmo_BI_prop_adac",
  sample_prob = "sampling_prob",
  data = dsl_df
)
summary(trial_model)
# DSL linear model in which the outcome itself (`human_BE_prop`) is the
# variable declared as predicted, with `olmo_BE_prop` supplied as its
# prediction. Fit on whatever dsl_df holds at this point in the script.
style_model <- dsl(
  model = "lm",
  formula = human_BE_prop ~
    median_PC1 + median_PC4 +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC3 +
    week_index,
  predicted_var = "human_BE_prop",
  prediction = "olmo_BE_prop",
  sample_prob = "sampling_prob",
  data = dsl_df
)
summary(style_model)

45
dsl/final_bivariate.R Normal file
View File

@ -0,0 +1,45 @@
library(tidyverse)
#library(dsl)
library(dplyr)
# Load the DSL data frame from disk.
dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

# Per (source, isAuthorWMF) group:
#   total_sum                - number of rows with a non-missing
#                              resolution_outcome
#   count_resolution_outcome - sum of resolution_outcome over those rows
#   success_prop             - count_resolution_outcome / total_sum
#   median_ttr_days          - median TTR divided by 24
#                              (presumably TTR is in hours; confirm)
# The numerator uses na.rm = TRUE so that it covers the same non-missing rows
# as the denominator; without it a single NA turns the count and the
# proportion into NA for the whole group.
# .groups = "drop" returns an ungrouped result instead of one that is still
# grouped by `source`.
outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF) |>
  summarise(
    total_sum = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE),
    success_prop = count_resolution_outcome / total_sum,
    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
    .groups = "drop"
  )
library(ggplot2)
library(ggdist)
# Sign-preserving power transform: raises the magnitude of `x` to the power
# `p` and re-applies the original sign, so negative inputs stay negative.
#   x: numeric vector
#   p: exponent applied to abs(x)
# Returns a numeric vector the same length as `x`.
signed_power <- function(x, p) {
  magnitude <- abs(x)^p
  sign(x) * magnitude
}
# Sign-preserving log transform: log1p() of the magnitude of `x`, with the
# original sign re-applied. Zero maps to zero.
#   x: numeric vector
# Returns a numeric vector the same length as `x`.
signed_log <- function(x) {
  magnitude <- log1p(abs(x))
  sign(x) * magnitude
}
# Add sign-preserving transforms of the median PC3/PC4 columns:
# sp_* use signed_power() with exponent 0.2, sl_* use signed_log().
dsl_df <- mutate(
  dsl_df,
  sp_med_pc3_adac = signed_power(median_PC3_adac, 0.2),
  sp_med_pc4_adac = signed_power(median_PC4_adac, 0.2),
  sl_med_pc4_adac = signed_log(median_PC4_adac),
  sl_med_pc3_adac = signed_log(median_PC3_adac)
)
# Bivariate plot of log1p(TTR / 24) against the signed-log median PC4 column,
# one panel per `source`, with points and a loess smooth (span 0.5) coloured
# and shaped by isAuthorWMF.
ggplot(
  dsl_df,
  aes(
    y = log1p(TTR / 24),
    x = sl_med_pc4_adac,
    shape = isAuthorWMF,
    color = isAuthorWMF
  )
) +
  # The smooth is added before the points, so points are drawn on top.
  geom_smooth(method = "loess", span = 0.5) +
  geom_point() +
  facet_grid(~source) +
  scale_color_viridis_d() +
  theme_minimal()

View File

@ -3,8 +3,6 @@ library(tidyverse)
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv" dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE) dsl_df <- read.csv(dsl_csv, header = TRUE)
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html #https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
dsl_df <- dsl_df |>
filter(source == "c1")
library(survival) library(survival)
library(broom) library(broom)
@ -12,8 +10,8 @@ dsl_df$ttr_weeks <- dsl_df$TTR / 168
trial.survival <- Surv(dsl_df$ttr_weeks) trial.survival <- Surv(dsl_df$ttr_weeks)
trial.model <- coxph(trial.survival ~ isAuthorWMF + trial.model <- coxph(trial.survival ~ isAuthorWMF +
median_PC3_adac + week_index + median_PC3_adac + week_index +
median_gerrit_loc_delta + median_gerrit_reviewers + median_gerrit_loc_delta + median_gerrit_reviewers + source +
olmo_BI_prop_adac, data=dsl_df) phase + author_closer, data=dsl_df)
summary(trial.model) summary(trial.model)
trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T) trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T)
@ -26,3 +24,5 @@ ggplot(trial.tab,
surv.at.means <- survfit(trial.model) surv.at.means <- survfit(trial.model)
plot(surv.at.means, xlab="weeks", ylab="survival probability") plot(surv.at.means, xlab="weeks", ylab="survival probability")
# https://hbiostat.org/stat/binarysurv
# Redraw the survfit curve with the x-axis restricted to weeks 0-26.
plot(
  surv.at.means,
  xlab = "weeks",
  ylab = "survival probability",
  xlim = c(0, 26)
)

View File

@ -0,0 +1,17 @@
1. Open an SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:51687 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. Log in to RStudio Server using the following credentials:
user: mjilg
password: JZoMumQwFbIc7EUcaGbK
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 30651103