library(tidyverse)

# Build a sentence-level sample from the unified CSV ----
# Steps: read the CSV, keep the rows belonging to a random subset of TaskPHIDs
# within each `source` group, then expand each kept row into one row per
# quoted string found in `olmo_cleaned_sentences`.

main_csv <- "~/analysis_data/092925_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE)

set.seed(123) # For reproducibility

# Maximum number of distinct TaskPHIDs to keep per `source` group.
tasks_per_source <- 30

# Draw up to `n` distinct values from `ids` without replacement.
# The draw size is capped at the number of distinct values so that a group
# with fewer than `n` ids keeps all of them instead of raising
# "cannot take a sample larger than the population".
# Indexing through sample.int() avoids sample()'s special case where a single
# numeric value `x` is treated as the range 1:x.
sample_task_ids <- function(ids, n) {
  pool <- unique(ids)
  pool[sample.int(length(pool), min(n, length(pool)))]
}

# group_by() (rather than `.by`) is kept on purpose: it fixes the order in
# which groups consume random numbers, so the seeded draw is unchanged
# whenever every group has at least `tasks_per_source` distinct ids.
sampled_df <- main_df |>
  group_by(source) |>
  filter(TaskPHID %in% sample_task_ids(TaskPHID, tasks_per_source)) |>
  ungroup()

sentence_level_sample <- sampled_df |>
  # Pull out every run of non-quote characters that sits between two single
  # quotes. NOTE(review): presumably the column holds a list rendered as
  # "['a', 'b']"; a sentence that itself contains an apostrophe would be
  # split by this pattern -- confirm against the data.
  mutate(
    cleaned_sentences = str_extract_all(
      olmo_cleaned_sentences,
      "(?<=')[^']+(?=')"
    )
  ) |>
  unnest(cleaned_sentences) |>
  # The pattern also matches the ", " between adjacent quoted items; drop it.
  filter(cleaned_sentences != ", ") |>
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -starts_with("gerrit")
  )

# NOTE(review): 293 and 1.5 are unexplained constants -- confirm what they
# represent before relying on this figure. The value is only printed.
(nrow(sentence_level_sample) / 293) * 1.5

# NOTE(review): `output_df` is not defined anywhere in the visible script;
# it would need to be replaced before this line is re-enabled.
#write.csv(output_df, "100125_human_info_sample.csv", row.names = FALSE)