adding trial survival test and more information about adac variables

2025-10-27 17:54:14 -07:00 · 2025-10-27 17:54:14 -07:00 · 2efd961fed
commit 2efd961fed
parent ab1cb3efea
7 changed files with 3369 additions and 13 deletions
--- a/.RData
+++ b/.RData
--- a/analysis_data/style_dict_variables.R
+++ b/analysis_data/style_dict_variables.R
@ -4,9 +4,39 @@ library(tidyr)
 library(dplyr)
 library(purrr)
-unified_csv <-"~/analysis_data/102425_unified.csv"
+unified_csv <-"~/analysis_data/102725_unified.csv"
 # Read the unified table from the path held in `unified_csv`.
 unified_df <- read.csv(file = unified_csv, header = TRUE)
 # Boxplots of `leng` for each level of isAuthorWMF, one panel per `source`.
 # NOTE(review): no `fill` aesthetic is mapped below, so scale_fill_viridis_d()
 # has nothing to colour -- confirm whether a fill mapping was intended.
 ggplot(
  data = unified_df,
  mapping = aes(x = leng, y = as.factor(isAuthorWMF))
 ) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") +
  scale_fill_viridis_d() +
  theme_minimal()
 # Label groups: bug-behaviour labels and solution labels.
 BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
 SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
@ -168,7 +198,7 @@ ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
 ggplot(second_join, aes(
  x = as.factor(comment_type),    # x-axis grouping
-  y = olmo_VR_prop,
+  y = modal_verbs,
  fill = isAuthorWMF
 )) +
  ylim(0, 3) +
--- a/dsl/102725_DSL_df_adac.csv
+++ b/dsl/102725_DSL_df_adac.csv
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -81,9 +81,34 @@ human_list_unified_df <- unified_df %>%
      mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION", 
                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    # ADAC==0 proportions
    n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
    human_BE_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
    ),
    human_SOL_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
    ),
    human_VR_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR", 
                                               "SOLUTION DISCUSSION", "SOLUTION USAGE", 
                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    human_BI_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION", 
                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
-  select(-n_tags, -n_tags_adac)
+  select(-n_tags, -n_tags_adac, -n_tags_no_adac)
 olmo_list_unified_df <- unified_df %>%
@ -156,9 +181,33 @@ olmo_list_unified_df <- unified_df %>%
      mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION", 
                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
    olmo_BE_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
    ),
    olmo_SOL_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
    ),
    olmo_VR_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR", 
                                        "SOLUTION DISCUSSION", "SOLUTION USAGE", 
                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    olmo_BI_prop_no_adac = if_else(
      n_tags_no_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION", 
                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
-  select(-n_tags, -n_tags_adac)
+  select(-n_tags, -n_tags_adac, -n_tags_no_adac)
 # aggregate other Task-level variables and then join
 task_level_variables <- unified_df |>
@ -166,7 +215,14 @@ task_level_variables <- unified_df |>
  summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
            median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
            median_PC3 = median(PC3),
-            median_PC3_ADAC = median(PC3[ADAC==1])
+            median_PC3_adac = median(PC3[ADAC==1]),
            median_PC3_no_adac = median(PC3[ADAC==0]),
            median_PC1 = median(PC1),
            median_PC1_adac = median(PC1[ADAC==1]),
            median_PC1_no_adac = median(PC1[ADAC==0]),
            median_PC4 = median(PC4),
            median_PC4_adac = median(PC4[ADAC==1]),
            median_PC4_no_adac = median(PC4[ADAC==0]),
            )
 descriptions <- unified_df |>
@ -247,4 +303,4 @@ ggplot(task_level_variables, aes(
    y = "Time to Resolution (up to 60 days)",
  )
 # 4. save
-write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
+write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
--- a/dsl/survival.R
+++ b/dsl/survival.R
@ -0,0 +1,28 @@
 # Trial Cox proportional-hazards model of task time-to-resolution, fit on the
 # task-level DSL table restricted to source "c1".
 # Reference: https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
 library(tidyverse)
 library(survival)
 library(broom)
 dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 dsl_df <- dsl_df |>
  filter(source == "c1")
 # presumably TTR is in hours (168 h = 1 week) -- TODO confirm against the
 # aggregation script that writes this CSV.
 dsl_df$ttr_weeks <- dsl_df$TTR / 168
 # NOTE(review): Surv() is called with a time only, so every task is treated as
 # an observed event (no censoring). If TTR is capped or some tasks are still
 # unresolved, pass an event indicator: Surv(ttr_weeks, <event column>).
 # The response is built inside the formula so that it is evaluated from `data`
 # together with the covariates (same rows after NA handling, and reusable by
 # survfit()/predict()).
 trial.model <- coxph(
  Surv(ttr_weeks) ~ isAuthorWMF +
    median_PC3_adac + week_index +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    olmo_BI_prop_adac,
  data = dsl_df
 )
 summary(trial.model)
 # Hazard ratios (exponentiated coefficients) with confidence intervals.
 trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
 ggplot(trial.tab,
       aes(y = term, x = estimate, xmin = conf.low, xmax = conf.high)) +
  geom_pointrange() +  # plots center point (x) and range (xmin, xmax)
  geom_vline(xintercept = 1, color = "red") + # vertical line at HR=1
  labs(x = "hazard ratio", title = "Hazard ratios and 95% CIs") +
  theme_classic()
 # Survival curve predicted by the fitted model (survfit() default covariates).
 surv.at.means <- survfit(trial.model)
 plot(surv.at.means, xlab = "weeks", ylab = "survival probability")
--- a/p2/quest/adac_analysis.R
+++ b/p2/quest/adac_analysis.R
@ -0,0 +1,41 @@
 # Exploratory comparison of ADAC vs. non-ADAC task-level variables read from
 # the aggregated DSL table.
 library(tidyverse)
 main_csv <- "~/dsl/102725_DSL_df_adac.csv"
 main_df <- read.csv(main_csv, header = TRUE)
 # Per-task differences, computed as the no-ADAC value minus the ADAC value.
 main_df <- main_df |>
  mutate(
    pc_adac_delta = median_PC4_no_adac - median_PC4_adac,
    olmo_BI_adac_delta = olmo_BI_prop_no_adac - olmo_BI_prop_adac
  )
 # Boxplot of the OLMo BI-proportion delta by phase, filled by resolution
 # outcome and faceted by source. Labels describe the variables actually
 # mapped (the previous labels referred to PC4 / comment type / isAuthorWMF).
 ggplot(main_df, aes(
  x = as.factor(phase),    # x-axis grouping
  y = olmo_BI_adac_delta,
  fill = resolution_outcome
 )) +
  ylim(-3, 3) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") +   # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of OLMo BI proportion delta (no ADAC - ADAC)",
    x = "phase",
    y = "olmo_BI_prop_no_adac - olmo_BI_prop_adac",
    fill = "resolution_outcome"
  )
 # Scatter of the median PC3 over ADAC comments against week index, faceted by
 # source only.
 ggplot(main_df, aes(x = week_index,
                    y = median_PC3_adac, fill = resolution_outcome)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Median PC3 of ADAC comments by week (Faceted by source)",
    x = "week_index",
    y = "median_PC3_adac",
    fill = "resolution_outcome"
  )
 # Linear fit of the human BE proportion on the task's median PC1; columns are
 # resolved through `data` so coefficient names stay readable.
 lm(human_BE_prop ~ median_PC1, data = main_df)
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@ -1,17 +1,88 @@
 library(tidyverse)
 library(dplyr)
-neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
+#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
-neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE)  |> mutate(comment_text = text)
+#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE)  |> mutate(comment_text = text)
-neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
+#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
-neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE) |> mutate(comment_text = text)
+#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE) |> mutate(comment_text = text)
-pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
+#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
-pca_df <- read.csv(pca_csv  , header = TRUE) |> mutate(comment_text = text)
+#pca_df <- read.csv(pca_csv  , header = TRUE) |> mutate(comment_text = text)
-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_csv <- "~/analysis_data/102725_unified.csv"
 # Load the unified comment table and attach a whitespace-delimited word count.
 main_df <- read.csv(main_csv, header = TRUE) |>
  mutate(
    # Missing comment text is replaced by "" and therefore counts zero words.
    comment_wordcount = as.integer(
      str_count(replace_na(as.character(comment_text), ""), "\\S+")
    )
  )
 # Rows whose comment_type is "task_description".
 description_df <- filter(main_df, comment_type == "task_description")
 # Subcomments not flagged as GerritBot, joined to the description rows of the
 # same task. Both sides come from main_df, so every shared column other than
 # TaskPHID carries a .x (reply) / .y (description) suffix after the join.
 replies_df <- main_df |>
  filter(comment_type == "task_subcomment", isGerritBot != TRUE) |>
  left_join(description_df, by = "TaskPHID")
 # Scatter of reply-level PC4 vs. PC3, faceted by source (rows) and phase
 # (columns). replies_df is a self-join on TaskPHID, so reply-side columns
 # carry the ".x" suffix; the unsuffixed names (and `autho`) do not exist there.
 # NOTE(review): x is taken to be PC4 because of the axis label and the
 # symmetric +/-30 limits -- confirm this is the intended variable.
 ggplot(replies_df, aes(x = PC4.x, y = PC3.x, fill = comment_type.x)) +
  facet_grid(source.x ~ phase.x, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  xlim(-30, 30) +
  ylim(-30, 30) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "PCs for Task Comments (Faceted by source and phase)",
    x = "PC4",
    y = "PC3"
  )
 # Boxplot of reply-level PC1 grouped by the description row's author_closer,
 # filled by resolution outcome and faceted by the reply's source.
 # NOTE(review): the fill was written as `reso`, which is not a column of the
 # joined table; resolution_outcome.y is assumed here (same side as
 # author_closer.y) -- confirm.
 replies_df |>
  ggplot(aes(
    x = as.factor(author_closer.y),    # x-axis grouping
    y = PC1.x,
    fill = resolution_outcome.y
  )) +
  ylim(-30, 30) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source.x, scales = "fixed") +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC1",
    x = "author_closer",
    y = "PC1",
    fill = "resolution_outcome"
  )
 # Boxplot of PC4 for task descriptions grouped by author_closer, filled by
 # resolution outcome and faceted by source. The x and fill labels now name
 # the variables actually mapped.
 description_df |>
  ggplot(aes(
    x = as.factor(author_closer),    # x-axis grouping
    y = PC4,
    fill = resolution_outcome
  )) +
  facet_grid(  ~ source, scales = "fixed") +
  ylim(-40, 40) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC4",
    x = "author_closer",
    y = "PC4",
    fill = "resolution_outcome"
  )
 # Reduce main_df to the identifier, text and task-metadata columns listed.
 main_df <- select(
  main_df,
  TaskPHID, AuthorPHID, date_created, comment_text,
  isAuthorWMF, isGerritBot, resolution_outcome,
  task_title, priority
 )
 # Join main_df to neurobiber_description_pca_df