1
0

adding trial survival test and more information about adac variables

This commit is contained in:
Matthew Gaughan 2025-10-27 17:54:14 -07:00
parent ab1cb3efea
commit 2efd961fed
7 changed files with 3369 additions and 13 deletions

BIN
.RData

Binary file not shown.

View File

@ -4,9 +4,39 @@ library(tidyr)
library(dplyr)
library(purrr)
unified_csv <-"~/analysis_data/102425_unified.csv"
unified_csv <-"~/analysis_data/102725_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
unified_df |>
ggplot(
aes(
x=leng,
y=as.factor(isAuthorWMF)
)
) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal()
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
@ -168,7 +198,7 @@ ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
ggplot(second_join, aes(
x = as.factor(comment_type), # x-axis grouping
y = olmo_VR_prop,
y = modal_verbs,
fill = isAuthorWMF
)) +
ylim(0, 3) +

3130
dsl/102725_DSL_df_adac.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -81,9 +81,34 @@ human_list_unified_df <- unified_df %>%
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
# ADAC==0 proportions
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
human_BE_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
),
human_SOL_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
),
human_VR_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
"SOLUTION DISCUSSION", "SOLUTION USAGE",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
human_BI_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
.groups = "drop"
) |>
select(-n_tags, -n_tags_adac)
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
olmo_list_unified_df <- unified_df %>%
@ -156,9 +181,33 @@ olmo_list_unified_df <- unified_df %>%
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
olmo_BE_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
),
olmo_SOL_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
),
olmo_VR_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
"SOLUTION DISCUSSION", "SOLUTION USAGE",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
olmo_BI_prop_no_adac = if_else(
n_tags_no_adac == 0L,
NA_real_,
mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
.groups = "drop"
) |>
select(-n_tags, -n_tags_adac)
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
# aggregate other Task-level variables and then join
task_level_variables <- unified_df |>
@ -166,7 +215,14 @@ task_level_variables <- unified_df |>
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
median_PC3 = median(PC3),
median_PC3_ADAC = median(PC3[ADAC==1])
median_PC3_adac = median(PC3[ADAC==1]),
median_PC3_no_adac = median(PC3[ADAC==0]),
median_PC1 = median(PC1),
median_PC1_adac = median(PC1[ADAC==1]),
median_PC1_no_adac = median(PC1[ADAC==0]),
median_PC4 = median(PC4),
median_PC4_adac = median(PC4[ADAC==1]),
median_PC4_no_adac = median(PC4[ADAC==0]),
)
descriptions <- unified_df |>
@ -247,4 +303,4 @@ ggplot(task_level_variables, aes(
y = "Time to Resolution (up to 60 days)",
)
# 4. save
write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)

28
dsl/survival.R Normal file
View File

@ -0,0 +1,28 @@
# Trial Cox proportional-hazards model of task time-to-resolution (TTR).
# Walkthrough followed:
# https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
library(tidyverse)
library(survival)
library(broom)

dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

# Restrict the trial to rows whose source is "c1".
dsl_df <- dsl_df |>
  filter(source == "c1")

# Rescale TTR by 168; presumably TTR is in hours (168 h = 1 week) -- TODO confirm.
dsl_df$ttr_weeks <- dsl_df$TTR / 168

# NOTE(review): Surv() receives only a time, so every task is treated as
# having experienced the event (no censoring). If unresolved or truncated
# tasks are present in the data, pass an event indicator as the second
# argument -- confirm against how TTR was constructed.
trial.survival <- Surv(dsl_df$ttr_weeks)

trial.model <- coxph(
  trial.survival ~ isAuthorWMF +
    median_PC3_adac + week_index +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    olmo_BI_prop_adac,
  data = dsl_df
)
summary(trial.model)

# Exponentiated coefficients (hazard ratios) with confidence intervals.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)

ggplot(
  trial.tab,
  aes(y = term, x = estimate, xmin = conf.low, xmax = conf.high)
) +
  geom_pointrange() + # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept = 1, color = "red") + # vertical line at HR=1
  labs(x = "hazard ratio", title = "Hazard ratios and 95% CIs") +
  theme_classic()

# Survival curve predicted by the fitted model (survfit default covariates).
surv.at.means <- survfit(trial.model)
plot(surv.at.means, xlab = "weeks", ylab = "survival probability")

41
p2/quest/adac_analysis.R Normal file
View File

@ -0,0 +1,41 @@
# Exploratory plots and a quick regression on the task-level ADAC table.
library(tidyverse)

main_csv <- "~/dsl/102725_DSL_df_adac.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Per-task deltas: the *_no_adac value minus the matching *_adac value.
main_df <- main_df |>
  mutate(
    pc_adac_delta = median_PC4_no_adac - median_PC4_adac,
    olmo_BI_adac_delta = olmo_BI_prop_no_adac - olmo_BI_prop_adac
  )

# Boxplot of the OLMo BI-proportion delta by phase, split by resolution
# outcome and faceted by source. Labels now describe the variables that are
# actually mapped (the previous ones were left over from a PC4 plot).
ggplot(main_df, aes(
  x = as.factor(phase), # x-axis grouping
  y = olmo_BI_adac_delta,
  fill = resolution_outcome
)) +
  ylim(-3, 3) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "OLMo BI proportion delta (no ADAC - ADAC) by phase",
    x = "Phase",
    y = "OLMo BI proportion delta (no ADAC - ADAC)",
    fill = "Resolution outcome"
  )

# Scatter of median ADAC PC3 against week index, faceted by source only.
ggplot(main_df, aes(
  x = week_index,
  y = median_PC3_adac,
  fill = resolution_outcome
)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Median ADAC PC3 by week index (faceted by source)",
    x = "Week index",
    y = "Median PC3 (ADAC)"
  )

# Use data = so coefficient names are clean and predict()/update() work.
lm(human_BE_prop ~ median_PC1, data = main_df)

View File

@ -1,17 +1,88 @@
library(tidyverse)
library(dplyr)
neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_csv <- "~/analysis_data/102725_unified.csv"
# Load the unified comment-level table from main_csv.
main_df <- read.csv(main_csv , header = TRUE)
# Add a per-row word count: number of runs of non-whitespace characters in
# comment_text, with NA text replaced by "" (so it counts as 0 words).
main_df <- main_df |>
mutate(
comment_wordcount = as.integer(str_count(replace_na(as.character(comment_text), ""), "\\S+"))
)
# Rows whose comment_type is "task_description".
description_df <- main_df |>
filter(comment_type == "task_description")
# Rows whose comment_type is "task_subcomment", keeping only rows where
# `isGerritBot != TRUE` evaluates to TRUE (filter() also drops rows where the
# comparison is NA), then joined to description_df on TaskPHID.
# Both sides are filtered copies of main_df, so every shared non-key column
# comes out of the join suffixed: .x (sub-comment side) / .y (description side).
replies_df <- main_df |>
filter(comment_type == "task_subcomment") |>
filter(isGerritBot != TRUE) |>
left_join(
description_df,
by="TaskPHID"
)
# NOTE(review): `autho` does not match any column name used elsewhere in this
# script and looks like a truncated identifier -- confirm the intended x
# variable (the axis label below says "PC4").
# NOTE(review): PC3, comment_type, source and phase are referenced here without
# the .x/.y suffix that the join above adds to shared columns -- verify these
# names exist in replies_df.
ggplot(replies_df, aes(x = autho, y = PC3, fill = comment_type)) +
facet_grid(source~phase, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-30, 30) +
ylim(-30, 30) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "PCs for Task Comments (Faceted by source and phase)",
x = "PC4",
y = "PC3",
)
# Boxplot of the sub-comment PC1 (PC1.x) grouped by the description-side
# author_closer (author_closer.y), faceted by the sub-comment source (source.x).
# NOTE(review): `reso` looks like a truncated column name (possibly one of the
# suffixed resolution_outcome columns) -- confirm.
# NOTE(review): the title / axis / legend labels mention PC4, Comment_type and
# isAuthorWMF, which are not the variables mapped above -- confirm or update.
replies_df |>
ggplot(aes(
x = as.factor(author_closer.y), # x-axis grouping
y = PC1.x,
fill = reso
)) +
ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source.x, scales = "fixed") +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC4",
x = "Comment_type",
y = "PC4",
fill = "isAuthorWMF?"
)
# Boxplot of description PC4 grouped by author_closer, filled by
# resolution_outcome and faceted by source.
# NOTE(review): the x label ("Comment_type") and fill label ("isAuthorWMF?") do
# not match the mapped variables (author_closer, resolution_outcome) -- confirm.
description_df |>
ggplot(aes(
x = as.factor(author_closer), # x-axis grouping
y = PC4,
fill = resolution_outcome
)) +
facet_grid( ~ source, scales = "fixed") +
ylim(-40, 40) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC4",
x = "Comment_type",
y = "PC4",
fill = "isAuthorWMF?"
)
main_df <- main_df |>
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
# Join main_df to neurobiber_description_pca_df