adding trial survival test and more information about adac variables
This commit is contained in:
parent
ab1cb3efea
commit
2efd961fed
@ -4,9 +4,39 @@ library(tidyr)
|
|||||||
library(dplyr)
|
library(dplyr)
|
||||||
library(purrr)
|
library(purrr)
|
||||||
|
|
||||||
unified_csv <-"~/analysis_data/102425_unified.csv"
|
unified_csv <-"~/analysis_data/102725_unified.csv"
|
||||||
unified_df <- read.csv(unified_csv, header = TRUE)
|
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||||
|
|
||||||
|
|
||||||
|
unified_df |>
|
||||||
|
ggplot(
|
||||||
|
aes(
|
||||||
|
x=leng,
|
||||||
|
y=as.factor(isAuthorWMF)
|
||||||
|
)
|
||||||
|
) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
|
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
|
||||||
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
|
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
|
||||||
|
|
||||||
@ -168,7 +198,7 @@ ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
|
|||||||
|
|
||||||
ggplot(second_join, aes(
|
ggplot(second_join, aes(
|
||||||
x = as.factor(comment_type), # x-axis grouping
|
x = as.factor(comment_type), # x-axis grouping
|
||||||
y = olmo_VR_prop,
|
y = modal_verbs,
|
||||||
fill = isAuthorWMF
|
fill = isAuthorWMF
|
||||||
)) +
|
)) +
|
||||||
ylim(0, 3) +
|
ylim(0, 3) +
|
||||||
|
|||||||
3130
dsl/102725_DSL_df_adac.csv
Normal file
3130
dsl/102725_DSL_df_adac.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -81,9 +81,34 @@ human_list_unified_df <- unified_df %>%
|
|||||||
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
|
# ADAC==0 proportions
|
||||||
|
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
|
||||||
|
human_BE_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_SOL_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_VR_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_BI_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
.groups = "drop"
|
.groups = "drop"
|
||||||
) |>
|
) |>
|
||||||
select(-n_tags, -n_tags_adac)
|
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
|
||||||
|
|
||||||
|
|
||||||
olmo_list_unified_df <- unified_df %>%
|
olmo_list_unified_df <- unified_df %>%
|
||||||
@ -156,9 +181,33 @@ olmo_list_unified_df <- unified_df %>%
|
|||||||
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
|
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
|
||||||
|
olmo_BE_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_SOL_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_VR_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_BI_prop_no_adac = if_else(
|
||||||
|
n_tags_no_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
.groups = "drop"
|
.groups = "drop"
|
||||||
) |>
|
) |>
|
||||||
select(-n_tags, -n_tags_adac)
|
select(-n_tags, -n_tags_adac, -n_tags_no_adac)
|
||||||
|
|
||||||
# aggregate other Task-level variables and then join
|
# aggregate other Task-level variables and then join
|
||||||
task_level_variables <- unified_df |>
|
task_level_variables <- unified_df |>
|
||||||
@ -166,7 +215,14 @@ task_level_variables <- unified_df |>
|
|||||||
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
|
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
|
||||||
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
|
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
|
||||||
median_PC3 = median(PC3),
|
median_PC3 = median(PC3),
|
||||||
median_PC3_ADAC = median(PC3[ADAC==1])
|
median_PC3_adac = median(PC3[ADAC==1]),
|
||||||
|
median_PC3_no_adac = median(PC3[ADAC==0]),
|
||||||
|
median_PC1 = median(PC1),
|
||||||
|
median_PC1_adac = median(PC1[ADAC==1]),
|
||||||
|
median_PC1_no_adac = median(PC1[ADAC==0]),
|
||||||
|
median_PC4 = median(PC4),
|
||||||
|
median_PC4_adac = median(PC4[ADAC==1]),
|
||||||
|
median_PC4_no_adac = median(PC4[ADAC==0]),
|
||||||
)
|
)
|
||||||
|
|
||||||
descriptions <- unified_df |>
|
descriptions <- unified_df |>
|
||||||
@ -247,4 +303,4 @@ ggplot(task_level_variables, aes(
|
|||||||
y = "Time to Resolution (up to 60 days)",
|
y = "Time to Resolution (up to 60 days)",
|
||||||
)
|
)
|
||||||
# 4. save
|
# 4. save
|
||||||
write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
|
write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
|
||||||
|
|||||||
28
dsl/survival.R
Normal file
28
dsl/survival.R
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
||||||
|
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||||
|
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
|
||||||
|
dsl_df <- dsl_df |>
|
||||||
|
filter(source == "c1")
|
||||||
|
|
||||||
|
library(survival)
|
||||||
|
library(broom)
|
||||||
|
# Rescale TTR by 168 and build the survival response used by the Cox model below.
# NOTE(review): dividing by 168 presumes TTR is recorded in hours (168 h = 1 week) -- confirm.
dsl_df$ttr_weeks <- dsl_df$TTR / 168
|
||||||
|
# NOTE(review): Surv() is given a time only, with no event/status argument.
# Per the survival package docs, every row is then treated as an observed
# event (no censoring) -- confirm that every task in this CSV was resolved,
# otherwise pass the status column as the second argument.
trial.survival <- Surv(dsl_df$ttr_weeks)
|
||||||
|
trial.model <- coxph(trial.survival ~ isAuthorWMF +
|
||||||
|
median_PC3_adac + week_index +
|
||||||
|
median_gerrit_loc_delta + median_gerrit_reviewers +
|
||||||
|
olmo_BI_prop_adac, data=dsl_df)
|
||||||
|
summary(trial.model)
|
||||||
|
# Tidy the Cox fit into one row per term; exponentiate so estimates and their
# confidence limits are hazard ratios. TRUE is spelled out because T is an
# ordinary, reassignable variable.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
|
||||||
|
|
||||||
|
ggplot(trial.tab,
|
||||||
|
aes(y=term, x=estimate, xmin=conf.low, xmax=conf.high)) +
|
||||||
|
geom_pointrange() + # plots center point (x) and range (xmin, xmax)
|
||||||
|
geom_vline(xintercept=1, color="red") + # vertical line at HR=1
|
||||||
|
labs(x="hazard ratio", title="Hazard ratios and 95% CIs") +
|
||||||
|
theme_classic()
|
||||||
|
|
||||||
|
surv.at.means <- survfit(trial.model)
|
||||||
|
plot(surv.at.means, xlab="weeks", ylab="survival probability")
|
||||||
41
p2/quest/adac_analysis.R
Normal file
41
p2/quest/adac_analysis.R
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
main_csv <- "~/dsl/102725_DSL_df_adac.csv"
|
||||||
|
main_df <- read.csv(main_csv , header = TRUE)
|
||||||
|
|
||||||
|
main_df <- main_df |>
|
||||||
|
mutate(
|
||||||
|
pc_adac_delta = median_PC4_no_adac - median_PC4_adac,
|
||||||
|
olmo_BI_adac_delta = olmo_BI_prop_no_adac - olmo_BI_prop_adac
|
||||||
|
)
|
||||||
|
|
||||||
|
ggplot(main_df, aes(
|
||||||
|
x = as.factor(phase), # x-axis grouping
|
||||||
|
y = olmo_BI_adac_delta,
|
||||||
|
fill = resolution_outcome
|
||||||
|
)) +
|
||||||
|
ylim(-3, 3) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
# Labels match the aesthetics mapped above: x = phase, y = olmo_BI_adac_delta
# (olmo_BI_prop_no_adac - olmo_BI_prop_adac), fill = resolution_outcome.
labs(
|
||||||
|
title = "Boxplot of olmo_BI_adac_delta by phase",
|
||||||
|
x = "phase",
|
||||||
|
y = "olmo_BI_adac_delta (no-ADAC minus ADAC)",
|
||||||
|
fill = "resolution_outcome"
|
||||||
|
)
|
||||||
|
|
||||||
|
ggplot(main_df, aes(x = week_index,
|
||||||
|
y = median_PC3_adac, fill = resolution_outcome)) +
|
||||||
|
facet_grid(~source, scales="fixed") +
|
||||||
|
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
# Labels match the aesthetics mapped above: x = week_index, y = median_PC3_adac;
# the plot is faceted by source only.
labs(
|
||||||
|
title = "median_PC3_adac by week_index (faceted by source)",
|
||||||
|
x = "week_index",
|
||||||
|
y = "median_PC3_adac"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Regress human_BE_prop on median_PC1. Passing the data frame through `data =`
# keeps coefficient names free of the `main_df$` prefix and lets the fitted
# model be reused with predict()/update().
lm(human_BE_prop ~ median_PC1, data = main_df)
|
||||||
@ -1,17 +1,88 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
||||||
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
||||||
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
||||||
pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
main_csv <- "~/analysis_data/102725_unified.csv"
|
||||||
main_df <- read.csv(main_csv , header = TRUE)
|
main_df <- read.csv(main_csv , header = TRUE)
|
||||||
|
|
||||||
|
main_df <- main_df |>
|
||||||
|
mutate(
|
||||||
|
comment_wordcount = as.integer(str_count(replace_na(as.character(comment_text), ""), "\\S+"))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
description_df <- main_df |>
|
||||||
|
filter(comment_type == "task_description")
|
||||||
|
|
||||||
|
replies_df <- main_df |>
|
||||||
|
filter(comment_type == "task_subcomment") |>
|
||||||
|
filter(isGerritBot != TRUE) |>
|
||||||
|
left_join(
|
||||||
|
description_df,
|
||||||
|
by="TaskPHID"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Scatter of reply-level values from replies_df, faceted by source and phase.
# NOTE(review): `autho` looks like a truncated column name -- confirm the
# intended x variable (the x label and limits below suggest a PC column).
# NOTE(review): replies_df comes from left_join(..., description_df,
# by="TaskPHID"), where both sides derive from main_df, so shared non-key
# columns presumably carry .x/.y suffixes (as in PC1.x / source.x used further
# down). Unsuffixed PC3, comment_type, source and phase may not exist -- verify.
ggplot(replies_df, aes(x = autho, y = PC3, fill = comment_type)) +
|
||||||
|
facet_grid(source~phase, scales="fixed") +
|
||||||
|
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||||
|
xlim(-30, 30) +
|
||||||
|
ylim(-30, 30) +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(
|
||||||
|
title = "PCs for Task Comments (Faceted by source and phase)",
|
||||||
|
x = "PC4",
|
||||||
|
y = "PC3",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
replies_df |>
|
||||||
|
ggplot(aes(
|
||||||
|
x = as.factor(author_closer.y), # x-axis grouping
|
||||||
|
y = PC1.x,
|
||||||
|
# NOTE(review): no column named `reso` is created in the visible script; this
# looks truncated -- presumably resolution_outcome.x or resolution_outcome.y
# after the join. Confirm before running.
fill = reso
|
||||||
|
)) +
|
||||||
|
ylim(-30, 30) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
facet_grid(. ~ source.x, scales = "fixed") +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(
|
||||||
|
title = "Boxplot of PC4",
|
||||||
|
x = "Comment_type",
|
||||||
|
y = "PC4",
|
||||||
|
fill = "isAuthorWMF?"
|
||||||
|
)
|
||||||
|
|
||||||
|
description_df |>
|
||||||
|
ggplot(aes(
|
||||||
|
x = as.factor(author_closer), # x-axis grouping
|
||||||
|
y = PC4,
|
||||||
|
fill = resolution_outcome
|
||||||
|
)) +
|
||||||
|
facet_grid( ~ source, scales = "fixed") +
|
||||||
|
ylim(-40, 40) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
# Labels match the aesthetics mapped above: x = author_closer, y = PC4,
# fill = resolution_outcome.
labs(
|
||||||
|
title = "Boxplot of PC4",
|
||||||
|
x = "author_closer",
|
||||||
|
y = "PC4",
|
||||||
|
fill = "resolution_outcome"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
main_df <- main_df |>
|
main_df <- main_df |>
|
||||||
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
|
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
|
||||||
# Join main_df to neurobiber_description_pca_df
|
# Join main_df to neurobiber_description_pca_df
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user