adding trial survival test and more information about adac variables

2025-10-27 17:54:14 -07:00 · 2025-10-27 17:54:14 -07:00 · 2efd961fed
commit 2efd961fed
parent ab1cb3efea
7 changed files with 3369 additions and 13 deletions
--- a/.RData
+++ b/.RData
--- a/analysis_data/style_dict_variables.R
+++ b/analysis_data/style_dict_variables.R
@ -4,9 +4,39 @@ library(tidyr)
 library(dplyr)
 library(purrr)

-unified_csv <-"~/analysis_data/102425_unified.csv"
+unified_csv <-"~/analysis_data/102725_unified.csv"
 unified_df <- read.csv(unified_csv, header = TRUE) 

+
+# Horizontal boxplots of `leng` for each level of isAuthorWMF, one panel per
+# `source` (shared axis scales across panels).
+# NOTE(review): `leng` is not defined in the visible part of this file --
+# confirm it names a column of unified_df.
+ggplot(unified_df, aes(x = leng, y = as.factor(isAuthorWMF))) +
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  facet_grid(cols = vars(source), scales = "fixed") +
+  scale_fill_viridis_d() +
+  theme_minimal()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
 SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")

@ -168,7 +198,7 @@ ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +

 ggplot(second_join, aes(
  x = as.factor(comment_type),    # x-axis grouping
-  y = olmo_VR_prop,
+  y = modal_verbs,
  fill = isAuthorWMF
 )) +
  ylim(0, 3) +
--- a/dsl/102725_DSL_df_adac.csv
+++ b/dsl/102725_DSL_df_adac.csv
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -81,9 +81,34 @@ human_list_unified_df <- unified_df %>%
      mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION", 
                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
+    # ADAC==0 proportions
+    n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
+    human_BE_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    human_SOL_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    human_VR_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR", 
+                                               "SOLUTION DISCUSSION", "SOLUTION USAGE", 
+                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    human_BI_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION", 
+                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
    .groups = "drop"
  ) |>
-  select(-n_tags, -n_tags_adac)
+  select(-n_tags, -n_tags_adac, -n_tags_no_adac)


 olmo_list_unified_df <- unified_df %>%
@ -156,9 +181,33 @@ olmo_list_unified_df <- unified_df %>%
      mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION", 
                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
+    n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
+    olmo_BE_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    olmo_SOL_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    olmo_VR_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR", 
+                                        "SOLUTION DISCUSSION", "SOLUTION USAGE", 
+                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    olmo_BI_prop_no_adac = if_else(
+      n_tags_no_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION", 
+                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
    .groups = "drop"
  ) |>
-  select(-n_tags, -n_tags_adac)
+  select(-n_tags, -n_tags_adac, -n_tags_no_adac)

 # aggregate other Task-level variables and then join
 task_level_variables <- unified_df |>
@ -166,7 +215,14 @@ task_level_variables <- unified_df |>
  summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
            median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
            median_PC3 = median(PC3),
-            median_PC3_ADAC = median(PC3[ADAC==1])
+            median_PC3_adac = median(PC3[ADAC==1]),
+            median_PC3_no_adac = median(PC3[ADAC==0]),
+            median_PC1 = median(PC1),
+            median_PC1_adac = median(PC1[ADAC==1]),
+            median_PC1_no_adac = median(PC1[ADAC==0]),
+            median_PC4 = median(PC4),
+            median_PC4_adac = median(PC4[ADAC==1]),
+            median_PC4_no_adac = median(PC4[ADAC==0]),
            )

 descriptions <- unified_df |>
@ -247,4 +303,4 @@ ggplot(task_level_variables, aes(
    y = "Time to Resolution (up to 60 days)",
  )
 # 4. save
-write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
+write.csv(task_level_variables, "102725_DSL_df_adac.csv", row.names = FALSE)
--- a/dsl/survival.R
+++ b/dsl/survival.R
@ -0,0 +1,28 @@
+# Trial Cox proportional-hazards model of time-to-resolution for source "c1".
+# Reference: https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
+library(tidyverse)
+library(survival)
+library(broom)
+
+dsl_csv <- "~/dsl/102725_DSL_df_adac.csv"
+dsl_df <- read.csv(dsl_csv, header = TRUE) |>
+  filter(source == "c1")
+
+# 168 = hours per week; assumes TTR is recorded in hours -- TODO confirm.
+dsl_df$ttr_weeks <- dsl_df$TTR / 168
+# NOTE(review): no event indicator is passed, so Surv() treats every task as
+# having experienced the event (no censoring) -- confirm this is intended.
+trial.survival <- Surv(dsl_df$ttr_weeks)
+trial.model <- coxph(trial.survival ~ isAuthorWMF +
+                       median_PC3_adac + week_index +
+                       median_gerrit_loc_delta + median_gerrit_reviewers +
+                       olmo_BI_prop_adac, data = dsl_df)
+summary(trial.model)
+# Exponentiated coefficients (hazard ratios) with confidence intervals.
+# TRUE is spelled out because T is an ordinary, reassignable variable.
+trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
+
+ggplot(trial.tab,
+       aes(y = term, x = estimate, xmin = conf.low, xmax = conf.high)) +
+  geom_pointrange() +  # plots center point (x) and range (xmin, xmax)
+  geom_vline(xintercept = 1, color = "red") + # vertical line at HR=1
+  labs(x = "hazard ratio", title = "Hazard ratios and 95% CIs") +
+  theme_classic()
+
+# Fitted survival curve from the Cox model (survfit default covariate values).
+surv.at.means <- survfit(trial.model)
+plot(surv.at.means, xlab = "weeks", ylab = "survival probability")
--- a/p2/quest/adac_analysis.R
+++ b/p2/quest/adac_analysis.R
@ -0,0 +1,41 @@
+# Exploratory plots comparing the `_no_adac` and `_adac` variants of the
+# task-level columns written by the DSL aggregation step.
+library(tidyverse)
+
+main_csv <- "~/dsl/102725_DSL_df_adac.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+# Each delta is (`_no_adac` value) - (`_adac` value) for the same task row.
+main_df <- main_df |>
+  mutate(
+    pc_adac_delta = median_PC4_no_adac - median_PC4_adac,
+    olmo_BI_adac_delta = olmo_BI_prop_no_adac - olmo_BI_prop_adac
+  )
+
+# Boxplot of the olmo BI proportion delta per phase, filled by resolution
+# outcome and faceted by source. Labels name the variables actually plotted.
+ggplot(main_df, aes(
+  x = as.factor(phase),    # x-axis grouping
+  y = olmo_BI_adac_delta,
+  fill = resolution_outcome
+)) +
+  ylim(-3, 3) +
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  facet_grid(. ~ source, scales = "fixed") +   # Facet by source; adjust as needed
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Boxplot of olmo BI proportion delta (no ADAC - ADAC)",
+    x = "phase",
+    y = "olmo_BI_adac_delta",
+    fill = "resolution_outcome"
+  )
+
+# Scatter of median_PC3_adac against week_index, faceted by source only.
+ggplot(main_df, aes(x = week_index,
+                    y = median_PC3_adac, fill = resolution_outcome)) +
+  facet_grid(~source, scales = "fixed") +
+  geom_point(shape = 21, alpha = 0.3, size = 2) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Median PC3 (ADAC) by week (Faceted by source)",
+    x = "week_index",
+    y = "median_PC3_adac"
+  )
+
+# Pass the data frame via `data =` so coefficient names are the bare column
+# names and the fitted model can be reused with predict()/broom.
+lm(human_BE_prop ~ median_PC1, data = main_df)
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@ -1,17 +1,88 @@
 library(tidyverse)
 library(dplyr)
-neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
-neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE)  |> mutate(comment_text = text)
+#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
+#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE)  |> mutate(comment_text = text)

-neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
-neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE) |> mutate(comment_text = text)
+#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
+#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE) |> mutate(comment_text = text)

-pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
-pca_df <- read.csv(pca_csv  , header = TRUE) |> mutate(comment_text = text)
+#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
+#pca_df <- read.csv(pca_csv  , header = TRUE) |> mutate(comment_text = text)

-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_csv <- "~/analysis_data/102725_unified.csv"
 main_df <- read.csv(main_csv  , header = TRUE)

+# Add a per-comment word count: a "word" is a maximal run of non-whitespace
+# characters, and a missing comment_text is counted as zero words.
+main_df <- main_df |>
+  mutate(
+    comment_wordcount = comment_text |>
+      as.character() |>
+      replace_na("") |>
+      str_count("\\S+") |>
+      as.integer()
+  )
+
+
+
+
+# Rows of main_df that are task descriptions.
+description_df <- filter(main_df, comment_type == "task_description")
+
+# Subcomments not flagged as GerritBot, each joined to the description row(s)
+# sharing its TaskPHID. Both sides come from main_df, so every non-key column
+# appears twice with dplyr's default suffixes: .x (reply) and .y (description).
+replies_df <- main_df |>
+  filter(comment_type == "task_subcomment", isGerritBot != TRUE) |>
+  left_join(description_df, by = "TaskPHID")
+
+
+# Scatter plot over replies_df, faceted by source (rows) and phase (columns),
+# with both axes clamped to [-30, 30].
+# NOTE(review): `autho` looks like a truncated column name and will not
+# resolve; the x label and numeric xlim suggest a PC column was intended --
+# confirm. replies_df comes from a left_join of two main_df subsets, so
+# PC3, comment_type, source and phase presumably need a .x/.y suffix here
+# (as in the next plot) -- verify against names(replies_df).
+ggplot(replies_df, aes(x = autho, y = PC3, fill = comment_type)) +
+  facet_grid(source~phase, scales="fixed") +
+  geom_point(shape = 21, alpha=0.3, size=2) +
+  xlim(-30, 30) + 
+  ylim(-30, 30) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "PCs for Task Comments (Faceted by source and phase)",
+    x = "PC4",
+    y = "PC3",
+  )
+
+
+# Boxplot of the reply-side PC1 score (PC1.x), grouped by the task
+# description's author_closer value and faceted by the reply's source.
+# `fill` previously read `reso`, a truncated name that cannot resolve; it is
+# completed to the description-side resolution_outcome to match the
+# description-side author_closer.y used on the x axis.
+# NOTE(review): confirm .y (description) rather than .x (reply) is intended.
+replies_df |>
+  ggplot(aes(
+    x = as.factor(author_closer.y),    # x-axis grouping
+    y = PC1.x,
+    fill = resolution_outcome.y
+  )) +
+  ylim(-30, 30) +
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  facet_grid(. ~ source.x, scales = "fixed") +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Boxplot of PC1 (replies)",
+    x = "author_closer",
+    y = "PC1",
+    fill = "resolution_outcome"
+  )
+
+# Boxplot of PC4 for task descriptions, grouped by author_closer, filled by
+# resolution_outcome and faceted by source. The x and fill labels name the
+# variables actually mapped (they previously read "Comment_type" and
+# "isAuthorWMF?", carried over from another plot).
+description_df |>
+  ggplot(aes(
+    x = as.factor(author_closer),    # x-axis grouping
+    y = PC4,
+    fill = resolution_outcome
+  )) +
+  facet_grid(  ~ source, scales = "fixed") +
+  ylim(-40, 40) +
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Boxplot of PC4",
+    x = "author_closer",
+    y = "PC4",
+    fill = "resolution_outcome"
+  )
+
+
 main_df <- main_df |>
  select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
 # Join main_df to neurobiber_description_pca_df