updating for new bivariate plots
This commit is contained in:
parent
2efd961fed
commit
6f2858dd72
@ -7,6 +7,19 @@ library(purrr)
|
|||||||
unified_csv <-"~/analysis_data/102725_unified.csv"
|
unified_csv <-"~/analysis_data/102725_unified.csv"
|
||||||
unified_df <- read.csv(unified_csv, header = TRUE)
|
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||||
|
|
||||||
|
# Flag rows whose OLMo sentence labels mention "ACTION ON ISSUE" or
# "TASK PROGRESS" (case-insensitive). Stored as numeric 0/1; rows with a
# missing label are 0.
unified_df <- unified_df |>
  mutate(
    has_update = as.numeric(
      !is.na(olmo_sentence_labels) &
        grepl(
          "(ACTION ON ISSUE|TASK PROGRESS)",
          as.character(olmo_sentence_labels),
          ignore.case = TRUE
        )
    )
  )
|
||||||
|
|
||||||
|
# Logistic regression of the has_update indicator on PC1-PC4 and
# modal_verbs, followed by the coefficient summary.
model <- glm(
  has_update ~ PC1 + PC2 + PC3 + PC4 + modal_verbs,
  data = unified_df,
  family = binomial(link = "logit")
)
summary(model)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
unified_df |>
|
unified_df |>
|
||||||
ggplot(
|
ggplot(
|
||||||
|
|||||||
56
dsl/dsl.R
Normal file
56
dsl/dsl.R
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(dsl)
|
||||||
|
|
||||||
|
# Path to the DSL analysis table, then read it (first row is the header).
dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
|
||||||
|
|
||||||
|
|
||||||
|
# DSL power analysis for the logit model of dsl_score, evaluated at five
# candidate labeled-sample sizes.
# NOTE(review): presumably human_SOL_prop_adac is the hand-coded measure and
# olmo_SOL_prop_adac its automated prediction -- confirm against the data
# dictionary.
power_model <- power_dsl(
  data = dsl_df,
  model = "logit",
  formula = dsl_score ~
    human_SOL_prop_adac +
    median_gerrit_loc_delta +
    median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(source) +
    median_PC3_adac +
    week_index,
  predicted_var = "human_SOL_prop_adac",
  prediction = "olmo_SOL_prop_adac",
  sample_prob = "sampling_prob",
  labeled_size = c(100, 200, 300, 600, 1000)
)

# Print the power results, then plot them for the focal coefficient.
summary(power_model)
plot(power_model, coef_name = "human_SOL_prop_adac")
|
||||||
|
|
||||||
|
# Keep only the rows from source "c1" for the models that follow.
dsl_df <- filter(dsl_df, source == "c1")
|
||||||
|
|
||||||
|
# DSL logit model of dsl_score with human_BI_prop_adac as the variable that
# is predicted by olmo_BI_prop_adac.
# NOTE(review): sampling_prob is presumably each row's probability of being
# hand-labeled -- confirm.
trial_model <- dsl(
  data = dsl_df,
  model = "logit",
  formula = dsl_score ~
    human_BI_prop_adac +
    median_gerrit_loc_delta +
    median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC4_adac +
    week_index,
  predicted_var = "human_BI_prop_adac",
  prediction = "olmo_BI_prop_adac",
  sample_prob = "sampling_prob"
)
summary(trial_model)
|
||||||
|
|
||||||
|
# DSL linear model in which the outcome itself (human_BE_prop) is the
# variable predicted by olmo_BE_prop.
style_model <- dsl(
  data = dsl_df,
  model = "lm",
  formula = human_BE_prop ~
    median_PC1 +
    median_PC4 +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC3 +
    week_index,
  predicted_var = "human_BE_prop",
  prediction = "olmo_BE_prop",
  sample_prob = "sampling_prob"
)
summary(style_model)
|
||||||
45
dsl/final_bivariate.R
Normal file
45
dsl/final_bivariate.R
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
#library(dsl)
|
||||||
|
library(dplyr)
|
||||||
|
# Path to the DSL analysis table, then read it (first row is the header).
dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
|
||||||
|
|
||||||
|
# Resolution-outcome and time-to-resolution summary for every
# source x isAuthorWMF combination.
outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF) |>
  summarise(
    # Denominator: rows that have a recorded outcome.
    total_sum = sum(!is.na(resolution_outcome)),
    # na.rm = TRUE keeps the numerator on the same rows as the denominator;
    # without it a single missing outcome turns the group's count and
    # proportion into NA.
    count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE),
    success_prop = count_resolution_outcome / total_sum,
    # NOTE(review): dividing by 24 assumes TTR is recorded in hours -- confirm.
    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
    # Return an ungrouped table so later verbs do not silently run per source.
    .groups = "drop"
  )
|
||||||
|
|
||||||
|
|
||||||
|
library(ggplot2)
|
||||||
|
library(ggdist)
|
||||||
|
|
||||||
|
|
||||||
|
# Sign-preserving power transform: raises |x| to the power p and then
# restores the sign of x, so negative inputs stay negative.
signed_power <- function(x, p) {
  magnitude <- abs(x)^p
  magnitude * sign(x)
}
|
||||||
|
|
||||||
|
# Sign-preserving log transform: log1p of |x| carrying the sign of x.
signed_log <- function(x) {
  magnitude <- log1p(abs(x))
  magnitude * sign(x)
}
|
||||||
|
# Add sign-preserving transforms of the median PC3 / PC4 scores:
# sp_* columns use signed_power() with exponent 0.2, sl_* use signed_log().
dsl_df <- mutate(
  dsl_df,
  sp_med_pc3_adac = signed_power(median_PC3_adac, 0.2),
  sp_med_pc4_adac = signed_power(median_PC4_adac, 0.2),
  sl_med_pc4_adac = signed_log(median_PC4_adac),
  sl_med_pc3_adac = signed_log(median_PC3_adac)
)
|
||||||
|
|
||||||
|
|
||||||
|
# Bivariate plot: log1p(TTR / 24) against the signed-log median PC4 score,
# one panel per source, colour and point shape keyed to isAuthorWMF.
# The LOESS smooth is added before the points so the points draw on top.
ggplot(
  data = dsl_df,
  mapping = aes(
    x = sl_med_pc4_adac,
    y = log1p(TTR / 24),
    color = isAuthorWMF,
    shape = isAuthorWMF
  )
) +
  geom_smooth(method = "loess", span = 0.5) +
  geom_point() +
  facet_grid(~source) +
  scale_color_viridis_d() +
  theme_minimal()
|
||||||
@ -3,8 +3,6 @@ library(tidyverse)
|
|||||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
||||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||||
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
|
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
|
||||||
dsl_df <- dsl_df |>
|
|
||||||
filter(source == "c1")
|
|
||||||
|
|
||||||
library(survival)
|
library(survival)
|
||||||
library(broom)
|
library(broom)
|
||||||
@ -12,8 +10,8 @@ dsl_df$ttr_weeks <- dsl_df$TTR / 168
|
|||||||
trial.survival <- Surv(dsl_df$ttr_weeks)
|
trial.survival <- Surv(dsl_df$ttr_weeks)
|
||||||
trial.model <- coxph(trial.survival ~ isAuthorWMF +
|
trial.model <- coxph(trial.survival ~ isAuthorWMF +
|
||||||
median_PC3_adac + week_index +
|
median_PC3_adac + week_index +
|
||||||
median_gerrit_loc_delta + median_gerrit_reviewers +
|
median_gerrit_loc_delta + median_gerrit_reviewers + source +
|
||||||
olmo_BI_prop_adac, data=dsl_df)
|
phase + author_closer, data=dsl_df)
|
||||||
summary(trial.model)
|
summary(trial.model)
|
||||||
# Tidy coefficient table with exponentiated estimates and confidence
# intervals. TRUE is spelled out because T is an ordinary, reassignable name.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
|
# Tidy coefficient table with exponentiated estimates and confidence
# intervals. TRUE is spelled out because T is an ordinary, reassignable name.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
|
||||||
|
|
||||||
@ -26,3 +24,5 @@ ggplot(trial.tab,
|
|||||||
|
|
||||||
surv.at.means <- survfit(trial.model)
|
surv.at.means <- survfit(trial.model)
|
||||||
plot(surv.at.means, xlab="weeks", ylab="survival probability")
|
plot(surv.at.means, xlab="weeks", ylab="survival probability")
|
||||||
|
#https://hbiostat.org/stat/binarysurv
|
||||||
|
plot(surv.at.means, xlab = "weeks", ylab = "survival probability", xlim = c(0, 26))
|
||||||
|
|||||||
17
mgaughan-rstudio-server_30651103.out
Normal file
17
mgaughan-rstudio-server_30651103.out
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
|
ssh -N -L 8787:n3439:51687 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
|
2. log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
|
user: mjilg
|
||||||
|
password: JZoMumQwFbIc7EUcaGbK
|
||||||
|
|
||||||
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
|
scancel -f 30651103
|
||||||
Loading…
Reference in New Issue
Block a user