1
0
mw-lifecycle-analysis/dsl/human_sampling.R
2025-10-06 09:37:06 -07:00

47 lines
1.3 KiB
R

library(tidyverse)
# Read the input CSV into a data frame; the first row holds column names.
main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(file = main_csv, header = TRUE)
# Draw a per-source random sample of tasks and keep every row belonging to a
# sampled task.
set.seed(1893) # For reproducibility
sampled_df <- main_df %>%
  group_by(source) %>%
  mutate(
    # Pick up to 40 distinct TaskPHIDs within each source. The size is capped
    # at the number of distinct tasks in the group so that sample() does not
    # error ("cannot take a sample larger than the population") for sources
    # with fewer than 40 tasks; such sources keep all of their tasks.
    sampled_TaskPHID = TaskPHID %in%
      sample(unique(TaskPHID), min(40, length(unique(TaskPHID))))
  ) %>%
  ungroup() %>%
  # Keep only rows of sampled tasks, then drop the helper flag column.
  filter(sampled_TaskPHID) %>%
  select(-sampled_TaskPHID)
# Flag a verification subset: within each source, up to 8 distinct tasks are
# drawn at random and all of their rows get verification_sample = 1L; every
# other row gets 0L.
labeling_sampled_df <- sampled_df |>
  group_by(source) |>
  mutate(
    verification_sample = as.integer(
      TaskPHID %in% sample(unique(TaskPHID), min(8, n_distinct(TaskPHID)))
    )
  ) |>
  ungroup()
# Expand the quoted strings stored in `olmo_cleaned_sentences` into one row
# per extracted sentence, then drop columns not needed in the output.
sentence_level_sample <- labeling_sampled_df |>
  mutate(
    cleaned_sentences = lapply(
      olmo_cleaned_sentences,
      function(x) {
        # Each match is either a single-quoted string (capture column 2) or a
        # double-quoted string (capture column 3); only one of the two is
        # filled for any given match.
        m <- str_match_all(x, "'([^']*)'|\"([^\"]*)\"")[[1]]
        # Merge the two capture columns row by row so the sentences stay in
        # the order in which they appear in the text. Concatenating the
        # columns instead would list every single-quoted sentence before any
        # double-quoted one.
        vals <- coalesce(na_if(m[, 2], ""), na_if(m[, 3], ""))
        # Discard matches whose captured content is empty.
        vals[!is.na(vals)]
      }
    )
  ) |>
  # One output row per sentence; rows with no extracted sentence are dropped.
  unnest(cleaned_sentences) |>
  # NOTE(review): ", " presumably arises when the text between two quoted
  # items is itself captured as a match — confirm against the raw data.
  filter(cleaned_sentences != ", ") |>
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -contains("gerrit")
  )
# Summary checks on the sentence-level sample, followed by the CSV export.

# Number of sentence rows per verification flag value (0 / 1).
table(sentence_level_sample[["verification_sample"]])

# NOTE(review): 293 and 1.5 are unexplained constants; this looks like a
# workload estimate derived from the row count — confirm their meaning.
nrow(sentence_level_sample) / 293 * 1.5

# Number of distinct tasks represented in the sample.
n_distinct(sentence_level_sample$TaskPHID)

# Write the sample to the working directory without a row-name column.
write.csv(
  sentence_level_sample,
  file = "100625_human_info_sample.csv",
  row.names = FALSE
)