pulling sample for human_labeling

2025-10-06 09:14:00 -07:00 · 2025-10-06 09:14:00 -07:00 · a14b08cfd8
commit a14b08cfd8
parent 83bcc15811
3 changed files with 59732 additions and 7 deletions
--- a/dsl/100625_human_info_sample.csv
+++ b/dsl/100625_human_info_sample.csv
--- a/dsl/human_sampling.R
+++ b/dsl/human_sampling.R
@ -1,23 +1,45 @@
 library(tidyverse)

-main_csv <-"~/analysis_data/092925_unified_phab.csv"
+main_csv <-"~/analysis_data/100325_unified_phab.csv"
 main_df <- read.csv(main_csv, header = TRUE) 

-set.seed(123) # For reproducibility
+set.seed(1871) # For reproducibility

 sampled_df <- main_df %>%
  group_by(source) %>%
-  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 30)) %>%
+  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 35)) %>%
  ungroup() %>%
  filter(sampled_TaskPHID) %>%
  select(-sampled_TaskPHID) 

-sentence_level_sample <- sampled_df |>
-  mutate(cleaned_sentences = str_extract_all(olmo_cleaned_sentences, "(?<=')[^']+(?=')")) |>
+labeling_sampled_df <- sampled_df %>%
+  group_by(source) %>%
+  mutate(
+    verification_sample = if_else(
+      TaskPHID %in% sample(unique(TaskPHID), min(7, length(unique(TaskPHID)))), 1L, 0L
+    )
+  ) %>%
+  ungroup()
+  
+sentence_level_sample <- labeling_sampled_df |>
+  mutate(
+    cleaned_sentences = lapply(
+      olmo_cleaned_sentences,
+      function(x) {
+        m <- str_match_all(x, "'([^']*)'|\"([^\"]*)\"")[[1]]
+        # Exactly one group is filled per match; coalesce row-wise to keep match order
+        vals <- coalesce(m[, 2], m[, 3])
+        vals <- vals[!is.na(vals) & vals != ""]
+        return(vals)
+      }
+    )
+  )|>
  unnest(cleaned_sentences)|>
  filter(cleaned_sentences != ", ") |>
-  select(-olmo_sentence_categories, -starts_with("normalized"), -starts_with("gerrit"))
+  select(-olmo_sentence_categories, -starts_with("normalized"), -contains("gerrit"))

+
+table(sentence_level_sample$verification_sample)
 (nrow(sentence_level_sample) / 293) * 1.5

-#write.csv(output_df, "100125_human_info_sample.csv", row.names = FALSE)
+write.csv(sentence_level_sample, "100625_human_info_sample.csv", row.names = FALSE)
--- a/mgaughan-rstudio-server_29985545.out
+++ b/mgaughan-rstudio-server_29985545.out
@ -0,0 +1,17 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:58519 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: [REDACTED]
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 29985545