adding trial survival test and more information about adac variables
This commit is contained in:
parent
ab1cb3efea
commit
2efd961fed
@ -4,9 +4,39 @@ library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
unified_csv <-"~/analysis_data/102425_unified.csv"
|
||||
unified_csv <-"~/analysis_data/102725_unified.csv"
|
||||
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||
|
||||
|
||||
unified_df |>
|
||||
ggplot(
|
||||
aes(
|
||||
x=leng,
|
||||
y=as.factor(isAuthorWMF)
|
||||
)
|
||||
) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
|
||||
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
|
||||
|
||||
@ -168,7 +198,7 @@ ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
|
||||
|
||||
ggplot(second_join, aes(
|
||||
x = as.factor(comment_type), # x-axis grouping
|
||||
y = olmo_VR_prop,
|
||||
y = modal_verbs,
|
||||
fill = isAuthorWMF
|
||||
)) +
|
||||
ylim(0, 3) +
|
||||
|
||||
3130
dsl/102725_DSL_df_adac.csv
Normal file
3130
dsl/102725_DSL_df_adac.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -81,9 +81,34 @@ human_list_unified_df <- unified_df %>%
|
||||
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
# ADAC==0 proportions
|
||||
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
|
||||
human_BE_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
human_SOL_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
),
|
||||
human_VR_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
human_BI_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
.groups = "drop"
|
||||
) |>
|
||||
select(-n_tags, -n_tags_adac)
|
||||
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
|
||||
|
||||
|
||||
olmo_list_unified_df <- unified_df %>%
|
||||
@ -156,9 +181,33 @@ olmo_list_unified_df <- unified_df %>%
|
||||
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
|
||||
olmo_BE_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
olmo_SOL_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
),
|
||||
olmo_VR_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
olmo_BI_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
.groups = "drop"
|
||||
) |>
|
||||
select(-n_tags, -n_tags_adac)
|
||||
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
|
||||
|
||||
# aggregate other Task-level variables and then join
|
||||
task_level_variables <- unified_df |>
|
||||
@ -166,7 +215,14 @@ task_level_variables <- unified_df |>
|
||||
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
|
||||
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
|
||||
median_PC3 = median(PC3),
|
||||
median_PC3_ADAC = median(PC3[ADAC==1])
|
||||
median_PC3_adac = median(PC3[ADAC==1]),
|
||||
median_PC3_no_adac = median(PC3[ADAC==0]),
|
||||
median_PC1 = median(PC1),
|
||||
median_PC1_adac = median(PC1[ADAC==1]),
|
||||
median_PC1_no_adac = median(PC1[ADAC==0]),
|
||||
median_PC4 = median(PC4),
|
||||
median_PC4_adac = median(PC4[ADAC==1]),
|
||||
median_PC4_no_adac = median(PC4[ADAC==0]),
|
||||
)
|
||||
|
||||
descriptions <- unified_df |>
|
||||
@ -247,4 +303,4 @@ ggplot(task_level_variables, aes(
|
||||
y = "Time to Resolution (up to 60 days)",
|
||||
)
|
||||
# 4. save
|
||||
write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
|
||||
write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
|
||||
|
||||
28
dsl/survival.R
Normal file
28
dsl/survival.R
Normal file
@ -0,0 +1,28 @@
|
||||
# Trial Cox proportional-hazards model of time to resolution (TTR) on the
# task-level DSL data frame, restricted to rows where source == "c1".
# Walkthrough this script follows:
# https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
library(tidyverse)
library(survival)
library(broom)

dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

dsl_df <- dsl_df |>
  filter(source == "c1")

# Rescale TTR by 168 (the number of hours in a week).
# NOTE(review): assumes TTR is recorded in hours -- confirm against the
# script that writes this CSV.
dsl_df$ttr_weeks <- dsl_df$TTR / 168

# Surv() called with a time and no event indicator treats every row as an
# observed event (no censoring).
# NOTE(review): if some tasks were never resolved, pass an event column here.
trial.survival <- Surv(dsl_df$ttr_weeks)

trial.model <- coxph(
  trial.survival ~ isAuthorWMF +
    median_PC3_adac + week_index +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    olmo_BI_prop_adac,
  data = dsl_df
)
summary(trial.model)

# exponentiate = TRUE reports hazard ratios instead of log-hazard coefficients;
# conf.int = TRUE adds the conf.low / conf.high columns used by the plot below.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)

ggplot(
  trial.tab,
  aes(y = term, x = estimate, xmin = conf.low, xmax = conf.high)
) +
  geom_pointrange() + # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept = 1, color = "red") + # vertical line at HR=1
  labs(x = "hazard ratio", title = "Hazard ratios and 95% CIs") +
  theme_classic()

# Survival curve predicted from the fitted model with no newdata supplied.
surv.at.means <- survfit(trial.model)
plot(surv.at.means, xlab = "weeks", ylab = "survival probability")
|
||||
41
p2/quest/adac_analysis.R
Normal file
41
p2/quest/adac_analysis.R
Normal file
@ -0,0 +1,41 @@
|
||||
# Exploratory plots and a simple regression on the task-level DSL data frame,
# comparing the *_adac and *_no_adac variants of the task-level variables.
library(tidyverse)

main_csv <- "~/dsl/102725_DSL_df_adac.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Per-task differences between the no-ADAC and ADAC variants of each measure.
main_df <- main_df |>
  mutate(
    pc_adac_delta = median_PC4_no_adac - median_PC4_adac,
    olmo_BI_adac_delta = olmo_BI_prop_no_adac - olmo_BI_prop_adac
  )

# Boxplot of the olmo_BI delta by phase, split by resolution outcome and
# faceted by source. Labels name the variables that are actually mapped.
ggplot(main_df, aes(
  x = as.factor(phase), # x-axis grouping
  y = olmo_BI_adac_delta,
  fill = resolution_outcome
)) +
  ylim(-3, 3) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of olmo_BI_adac_delta by phase",
    x = "phase",
    y = "olmo_BI_prop_no_adac - olmo_BI_prop_adac",
    fill = "resolution_outcome"
  )

# Scatter of median_PC3_adac against week_index, faceted by source only.
ggplot(main_df, aes(
  x = week_index,
  y = median_PC3_adac,
  fill = resolution_outcome
)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "median_PC3_adac by week_index (faceted by source)",
    x = "week_index",
    y = "median_PC3_adac"
  )

# Use data = so the coefficient is named after the column, not main_df$...
lm(human_BE_prop ~ median_PC1, data = main_df)
|
||||
@ -1,17 +1,88 @@
|
||||
library(tidyverse)
|
||||
library(dplyr)
|
||||
neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
||||
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
||||
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
||||
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
||||
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
||||
pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
||||
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
||||
main_csv <- "~/analysis_data/102725_unified.csv"
|
||||
main_df <- read.csv(main_csv , header = TRUE)
|
||||
|
||||
main_df <- main_df |>
|
||||
mutate(
|
||||
comment_wordcount = as.integer(str_count(replace_na(as.character(comment_text), ""), "\\S+"))
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
description_df <- main_df |>
|
||||
filter(comment_type == "task_description")
|
||||
|
||||
replies_df <- main_df |>
|
||||
filter(comment_type == "task_subcomment") |>
|
||||
filter(isGerritBot != TRUE) |>
|
||||
left_join(
|
||||
description_df,
|
||||
by="TaskPHID"
|
||||
)
|
||||
|
||||
|
||||
ggplot(replies_df, aes(x = autho, y = PC3, fill = comment_type)) +
|
||||
facet_grid(source~phase, scales="fixed") +
|
||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||
xlim(-30, 30) +
|
||||
ylim(-30, 30) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "PCs for Task Comments (Faceted by source and phase)",
|
||||
x = "PC4",
|
||||
y = "PC3",
|
||||
)
|
||||
|
||||
|
||||
replies_df |>
|
||||
ggplot(aes(
|
||||
x = as.factor(author_closer.y), # x-axis grouping
|
||||
y = PC1.x,
|
||||
fill = reso
|
||||
)) +
|
||||
ylim(-30, 30) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
facet_grid(. ~ source.x, scales = "fixed") +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of PC4",
|
||||
x = "Comment_type",
|
||||
y = "PC4",
|
||||
fill = "isAuthorWMF?"
|
||||
)
|
||||
|
||||
description_df |>
|
||||
ggplot(aes(
|
||||
x = as.factor(author_closer), # x-axis grouping
|
||||
y = PC4,
|
||||
fill = resolution_outcome
|
||||
)) +
|
||||
facet_grid( ~ source, scales = "fixed") +
|
||||
ylim(-40, 40) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of PC4",
|
||||
x = "Comment_type",
|
||||
y = "PC4",
|
||||
fill = "isAuthorWMF?"
|
||||
)
|
||||
|
||||
|
||||
main_df <- main_df |>
|
||||
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
|
||||
# Join main_df to neurobiber_description_pca_df
|
||||
|
||||
Loading…
Reference in New Issue
Block a user