updating with DSL power analysis

2025-09-30 20:17:09 -07:00 · 2025-09-30 20:17:09 -07:00 · e61d3b6599
commit e61d3b6599
parent b7c2c9fcd6
5 changed files with 7244 additions and 0 deletions
--- a/dsl/092225_info_matt_labels.csv
+++ b/dsl/092225_info_matt_labels.csv
--- a/dsl/093025_power_dsl.csv
+++ b/dsl/093025_power_dsl.csv
--- a/dsl/dsl_data_transform.R
+++ b/dsl/dsl_data_transform.R
@ -0,0 +1,59 @@
+library(tidyverse)
+
+# Human-coded labels. For each task filer, at the task level, the summary
+# further down derives:
+#   - the proportion of Observed bug behavior + Expected behavior labels
+#   - the proportion of Solution discussion + Solution usage labels
+human_csv <- "~/dsl/092225_info_matt_labels.csv"
+human_df <- read.csv(file = human_csv, header = TRUE)
+
+#task_authors <- human_df %>%
+#  filter(comment_type == "task_description") %>%
+#  select(TaskPHID, AuthorPHID) %>%
+#  rename(Task_AuthorPHID = AuthorPHID)
+
+#result <- task_authors %>%
+#  rowwise() %>%
+#  mutate(
+#    bug_prop = {
+#      rows_by_author <- human_df %>% filter(AuthorPHID == task_authorPHID)
+#      mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
+#    }
+#  ) %>%
+#  ungroup()
+# One row per TaskPHID: share of human labels falling in the bug-behavior
+# (BE) pair and in the solution (SOL) pair. A missing label is never a
+# member of either set, so it only adds to the denominator.
+human_result <- human_df |>
+  group_by(TaskPHID) |>
+  summarise(
+    human_BE_prop = mean(
+      is.element(human_label, c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"))
+    ),
+    human_SOL_prop = mean(
+      is.element(human_label, c("SOLUTION DISCUSSION", "SOLUTION USAGE"))
+    )
+  )
+
+# Load the existing unified data frame. It feeds the task-level machine
+# label proportions (bug behavior / solution) computed further down, and
+# the task-level covariates selected here.
+main_csv <- "~/analysis_data/092925_unified_phab.csv"
+main_df <- read.csv(file = main_csv, header = TRUE)
+
+# Task-level fields, read off each task's "task_description" row.
+closed_relevance_summary <- main_df |>
+  filter(comment_type == "task_description") |>
+  select(
+    all_of(c(
+      "TaskPHID", "closed_relevance", "priority_score",
+      "source", "phase", "week_index"
+    ))
+  )
+
+# Machine (OLMo) label proportions plus comment-level covariates, one row
+# per TaskPHID, joined with the task-level fields and a 0/1 dsl_score.
+# TODO: need to get the TaskAuthor's comments, not just the big picture
+
+# Label proportions. The pattern consumes both quotes of every item, so the
+# text sitting between two quoted items (e.g. ", ") cannot be matched as if
+# it were a label, which a lookaround-only pattern allows. The quotes are
+# stripped once the list column has been expanded to one row per label.
+olmo_label_props <- main_df |>
+  mutate(olmo_label = str_extract_all(olmo_sentence_categories, "'[^']+'")) |>
+  unnest(olmo_label) |>
+  mutate(olmo_label = str_remove_all(olmo_label, "'")) |>
+  group_by(TaskPHID) |>
+  summarise(
+    olmo_BE_prop = mean(
+      olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
+    ),
+    olmo_SOL_prop = mean(
+      olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
+    )
+  )
+
+# Comment-level covariates are taken from the un-expanded rows. After
+# unnest() there is one row per label, which would turn n_comments into a
+# label count and weight the medians by the number of labels per comment.
+task_comment_stats <- main_df |>
+  group_by(TaskPHID) |>
+  summarise(
+    median_gerrit_delta = median(
+      gerrit_code_insertions + gerrit_code_deletions,
+      na.rm = TRUE
+    ),
+    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
+    n_comments = sum(comment_type == "task_subcomment", na.rm = TRUE)
+  )
+
+# Only tasks with at least one extracted label are kept (left side of the
+# first join), matching the row set the single-pipeline version produced.
+machine_result <- olmo_label_props |>
+  left_join(task_comment_stats, by = "TaskPHID") |>
+  left_join(closed_relevance_summary, by = "TaskPHID") |>
+  # read.csv() may parse a True/False column as logical, in which case a
+  # comparison with "True" is never TRUE. Upper-casing the text form works
+  # for both the logical and the character case; NA stays NA.
+  mutate(
+    dsl_score = ifelse(toupper(as.character(closed_relevance)) == "TRUE", 1, 0)
+  )
+
+# Attach the human label proportions; tasks without human labels keep NA in
+# the human_* columns.
+output_df <- machine_result |>
+  left_join(human_result, by = "TaskPHID")
+
+# Written to the same location dsl_power.R reads from, rather than to
+# whatever the working directory happens to be.
+write.csv(output_df, "~/dsl/093025_power_dsl.csv", row.names = FALSE)
--- a/dsl/dsl_power.R
+++ b/dsl/dsl_power.R
@ -0,0 +1,44 @@
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("naoki-egami/dsl", dependencies = TRUE)
+library(dsl)
+
+# Task-level table produced by dsl_data_transform.R.
+power_csv <- "~/dsl/093025_power_dsl.csv"
+power_df <- read.csv(power_csv, header = TRUE)
+
+# Per-source subsets. This script attaches only dsl, so the dplyr verb is
+# namespace-qualified: a bare filter() would resolve to stats::filter()
+# unless some other attached package happens to supply dplyr's version.
+power_c1 <- power_df |>
+  dplyr::filter(source == "c1")
+
+power_c2 <- power_df |>
+  dplyr::filter(source == "c2")
+
+power_c3 <- power_df |>
+  dplyr::filter(source == "c3")
+
+# Power analysis on the c1 subset over a range of labeled-sample sizes.
+# human_BE_prop is the hand-labeled variable and olmo_BE_prop is passed as
+# its prediction.
+power_model <- power_dsl(
+  data = power_c1,
+  model = "logit",
+  formula = dsl_score ~ human_BE_prop + median_gerrit_delta +
+    median_gerrit_reviewers + n_comments + priority_score + week_index,
+  predicted_var = "human_BE_prop",
+  prediction = "olmo_BE_prop",
+  labeled_size = c(100, 200, 300, 600, 1000)
+)
+
+summary(power_model)
+plot(power_model, coef_name = "human_BE_prop")
+
+# Trial DSL fit on the full table (all sources), using the same
+# specification as the power analysis.
+trial_model <- dsl(
+  data = power_df,
+  model = "logit",
+  formula = dsl_score ~ human_BE_prop + median_gerrit_delta +
+    median_gerrit_reviewers + n_comments + priority_score + week_index,
+  predicted_var = "human_BE_prop",
+  prediction = "olmo_BE_prop"
+)
+summary(trial_model)
--- a/mgaughan-rstudio-server_29920945.out
+++ b/mgaughan-rstudio-server_29920945.out
@ -0,0 +1,17 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: eSK3QbcwgGpUya1wJIvC
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 29920945