# File-viewer metadata: R script, 23 lines, 764 B
library(tidyverse)

# Input ----
# Location of the CSV that the rest of this script works from.
main_csv <- "~/analysis_data/092925_unified_phab.csv"

# Base read.csv() treats the first row as column names by default, so the
# header argument is left implicit.
main_df <- read.csv(file = main_csv)

set.seed(123) # For reproducibility

# Task-level sample ----
# Maximum number of distinct TaskPHID values to keep from each `source` group.
tasks_per_source <- 30L

# Draw up to `n` distinct ids from `ids`, without replacement.
#
# Indexing through sample.int() makes the same draws as
# sample(unique(ids), n) whenever at least `n` distinct ids exist, but:
#   * the size is capped at the number of distinct ids, so a small group no
#     longer aborts with "cannot take a sample larger than the population";
#   * it avoids sample()'s special case that treats a single numeric value x
#     as the range 1:x.
sample_task_ids <- function(ids, n) {
  pool <- unique(ids)
  pool[sample.int(length(pool), size = min(n, length(pool)))]
}

# Within each source, pick the task ids and keep every row that belongs to one
# of them. Groups with fewer than `tasks_per_source` tasks are kept whole.
sampled_df <- main_df %>%
  group_by(source) %>%
  filter(TaskPHID %in% sample_task_ids(TaskPHID, tasks_per_source)) %>%
  ungroup()

# Sentence-level table ----
# Matches every run of non-quote characters that sits between two single
# quotes. Applied to a quoted, comma-separated list this also captures the
# ", " separators between items, which are filtered out below.
# NOTE(review): presumably olmo_cleaned_sentences holds a stringified list of
# single-quoted sentences; a sentence containing an apostrophe would be split
# by this pattern -- confirm against the data.
quoted_span_pattern <- "(?<=')[^']+(?=')"

sentence_level_sample <- sampled_df |>
  # Drop the columns that are not needed downstream before exploding rows.
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -starts_with("gerrit")
  ) |>
  mutate(
    cleaned_sentences = str_extract_all(
      olmo_cleaned_sentences,
      quoted_span_pattern
    )
  ) |>
  # One output row per extracted piece.
  unnest(cleaned_sentences) |>
  # Discard the separator pieces picked up between adjacent quoted items.
  filter(cleaned_sentences != ", ")

# Ad-hoc estimate ----
# Scales the number of sentence rows by the constants 293 and 1.5 and prints
# the result when run at top level.
# NOTE(review): the meaning of 293 and 1.5 is not stated anywhere in this
# script -- presumably a throughput/effort estimate; confirm and name them.
n_sentences <- nrow(sentence_level_sample)
(n_sentences / 293) * 1.5

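# NOTE: `output_df` is not defined in this script; point this at the intended
# data frame before re-enabling the export.
# write.csv(output_df, "100125_human_info_sample.csv", row.names = FALSE)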