updating with new human labels

2025-10-21 15:19:13 -07:00 · 2025-10-21 15:19:13 -07:00 · 90311ca136
commit 90311ca136
parent b198781aa0
5 changed files with 63041 additions and 32 deletions
--- a/102125_human_info_sample.csv
+++ b/102125_human_info_sample.csv
--- a/analysis_data/102025_human_labels.csv
+++ b/analysis_data/102025_human_labels.csv
--- a/analysis_data/sampling_strat_check.R
+++ b/analysis_data/sampling_strat_check.R
@ -0,0 +1,71 @
+# Descriptive checks for the sampling strategy: summarises the full unified
+# dataset and the human-labeled subset.
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+
+pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+pre_unified_df <- read.csv(pre_unified_csv, header = TRUE)
+
+# Split rows into task descriptions and all other comment types.
+unified_task_df <- pre_unified_df |>
+  filter(comment_type == "task_description")
+unified_comment_df <- pre_unified_df |>
+  filter(comment_type != "task_description")
+
+library(scales)
+
+# Share of task descriptions in each source x phase x isAuthorWMF cell.
+unified_task_df |>
+  count(source, phase, isAuthorWMF) |>
+  mutate(pct = round(n / sum(n), 3))
+
+# Median number of non-description rows per author in the full dataset.
+median_n <- unified_comment_df |>
+  count(AuthorPHID) |>
+  pull(n) |>
+  median(na.rm = TRUE)
+median_n
+
+# Median number of rows (description plus comments) per task.
+median_comments <- pre_unified_df |>
+  count(TaskPHID) |>
+  pull(n) |>
+  median(na.rm = TRUE)
+median_comments
+
+human_csv <- "~/analysis_data/102025_human_labels.csv"
+human_df <- read.csv(human_csv, header = TRUE)
+
+# One row per distinct combination of the grouping columns, with that group's
+# labels gathered into a list-column. Missing labels become the literal string
+# "NA"; str_squish() then trims and collapses whitespace.
+cleaned_human_df <- human_df |>
+  mutate(
+    human_label = replace_na(human_label, "NA"),
+    human_label = str_squish(human_label)
+  ) |>
+  group_by(
+    id, TaskPHID, AuthorPHID, comment_text, task_title,
+    comment_type, priority, source, phase
+  ) |>
+  summarise(human_labels = list(human_label), .groups = "drop")
+
+# Non-description rows of the labeled sample, mirroring unified_comment_df.
+# Defined before the join below: the join suffixes non-key columns shared by
+# both tables, so comment_type is no longer available under that name after it.
+human_comment_df <- cleaned_human_df |>
+  filter(comment_type != "task_description")
+
+cleaned_human_df <- cleaned_human_df |>
+  left_join(pre_unified_df, by = c("id", "TaskPHID", "AuthorPHID"))
+
+# Median number of labeled non-description rows per author.
+# NOTE: this reuses the name median_comments and overwrites the per-task value.
+median_comments <- human_comment_df |>
+  count(AuthorPHID) |>
+  pull(n) |>
+  median(na.rm = TRUE)
+median_comments
--- a/dsl/human_sampling.R
+++ b/dsl/human_sampling.R
@ -1,27 +1,49 @@
 library(tidyverse)

-main_csv <-"~/analysis_data/100325_unified_phab.csv"
+main_csv <-"~/analysis_data/100625_unified_w_affil.csv"
 main_df <- read.csv(main_csv, header = TRUE) 

+human_csv <-"~/analysis_data/102025_human_labels.csv"  # existing human labels
+human_df <- read.csv(human_csv, header = TRUE)  
+selected_task_phids <- unique(human_df$TaskPHID)  # tasks already labeled; excluded from the new draw below
+
 set.seed(1893) # For reproducibility

-sampled_df <- main_df %>%
-  group_by(source) %>%
-  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
-  ungroup() %>%
+# Sampling round 1 (commented out, kept for reference): 40 random tasks per source
+#sampled_df <- main_df %>%
+#  group_by(source) %>%
+#  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
+#  ungroup() %>%
+#  filter(sampled_TaskPHID) %>%
+#  select(-sampled_TaskPHID) 
+
+# Sampling round 2: from tasks not yet human-labeled, draw up to 10 task
+# descriptions with source "c1", phase "3" and isAuthorWMF TRUE.
+sampled_tasks <- main_df |>
+  filter(!(TaskPHID %in% selected_task_phids),
+         source == "c1", phase == "3", isAuthorWMF == "TRUE",
+         comment_type == "task_description") |>
+  # min() guard: sample() errors when asked for more items than exist
+  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), min(10, length(unique(TaskPHID))))) %>%
  filter(sampled_TaskPHID) %>%
  select(-sampled_TaskPHID) 

-labeling_sampled_df <- sampled_df %>%
-  group_by(source) %>%
-  mutate(
-    verification_sample = if_else(
-      TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
-    )
-  ) %>%
-  ungroup()
+sampled_task_phids <- unique(sampled_tasks$TaskPHID)
+# Keep every main_df row belonging to a sampled task, not only its description row.
+sampled_df <- main_df |>
+  filter(TaskPHID %in% sampled_task_phids)
+
+#labeling_sampled_df <- sampled_df %>%
+#  group_by(source) %>%
+#  mutate(
+#    verification_sample = if_else(
+#      TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
+#    )
+#  ) %>%
+#  ungroup()
  
-sentence_level_sample <- labeling_sampled_df |>
+#sentence_level_sample <- labeling_sampled_df |>
+sentence_level_sample <- sampled_df |>
  mutate(
    cleaned_sentences = lapply(
      olmo_cleaned_sentences,
@ -43,4 +65,4 @@ table(sentence_level_sample$verification_sample)
 (nrow(sentence_level_sample) / 293) * 1.5
 length(unique(sentence_level_sample$TaskPHID))

-write.csv(sentence_level_sample, "100625_human_info_sample.csv", row.names = FALSE)
+write.csv(sentence_level_sample, "102125_human_info_sample.csv", row.names = FALSE)
--- a/mgaughan-rstudio-server_30181212.out
+++ b/mgaughan-rstudio-server_30181212.out
@ -1,17 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:57743 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: anx8V7R1X2rfcwUV20H/
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 30181212