69 lines
2.0 KiB
R
69 lines
2.0 KiB
R
library(tidyverse)
|
|
|
|
# Input locations: the main comment-level dataset and the human-labels file.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
human_csv <- "~/analysis_data/102025_human_labels.csv"

# Both files are read with their first row treated as column names.
main_df <- read.csv(file = main_csv, header = TRUE)
human_df <- read.csv(file = human_csv, header = TRUE)

# Distinct task identifiers present in the human-labels file. They are used
# further down to exclude those tasks from the new random draw.
selected_task_phids <- unique(human_df$TaskPHID)

# Fix the RNG state so the random task draw is reproducible across runs.
set.seed(1893)
|
|
|
|
# moment 1 of sampling
|
|
#sampled_df <- main_df %>%
|
|
# group_by(source) %>%
|
|
# mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
|
|
# ungroup() %>%
|
|
# filter(sampled_TaskPHID) %>%
|
|
# select(-sampled_TaskPHID)
|
|
|
|
# moment 2 of sampling
#
# Draw a random set of tasks that are not already in `selected_task_phids`,
# restricted to task-description rows with source "c1", phase "3" and
# isAuthorWMF "TRUE", then pull every row of `main_df` belonging to the drawn
# tasks.

# Number of tasks to draw; capped below at the number actually eligible.
n_tasks_to_sample <- 10L

# Rows that qualify a task for sampling. The comparisons are against strings
# on purpose: `==` coerces numeric/logical columns to character, so this works
# whether read.csv() parsed `phase` / `isAuthorWMF` as typed or as text.
eligible_tasks <- main_df |>
  filter(
    !(TaskPHID %in% selected_task_phids),
    source == "c1",
    phase == "3",
    isAuthorWMF == "TRUE",
    comment_type == "task_description"
  )

eligible_task_phids <- unique(eligible_tasks$TaskPHID)
n_tasks_drawn <- min(n_tasks_to_sample, length(eligible_task_phids))

# Asking sample() for more items than exist is an error; take what is there
# and say so instead of aborting the script.
if (n_tasks_drawn < n_tasks_to_sample) {
  warning(
    "Only ", n_tasks_drawn, " eligible task(s) available; requested ",
    n_tasks_to_sample, ".",
    call. = FALSE
  )
}

# Sample positions rather than values: this consumes the RNG exactly like
# sample(eligible_task_phids, n) but avoids sample()'s special-casing of a
# single numeric value (which would sample from 1:x).
drawn_task_phids <- eligible_task_phids[
  sample.int(length(eligible_task_phids), n_tasks_drawn)
]

# Task-description rows of the drawn tasks (kept in `main_df` row order).
sampled_tasks <- eligible_tasks |>
  filter(TaskPHID %in% drawn_task_phids)

sampled_task_phids <- unique(sampled_tasks$TaskPHID)

# All rows (every comment type) for the drawn tasks.
sampled_df <- main_df |>
  filter(TaskPHID %in% sampled_task_phids)
|
|
|
|
#labeling_sampled_df <- sampled_df %>%
|
|
# group_by(source) %>%
|
|
# mutate(
|
|
# verification_sample = if_else(
|
|
# TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
|
|
# )
|
|
# ) %>%
|
|
# ungroup()
|
|
|
|
#sentence_level_sample <- labeling_sampled_df |>
|
|
# Pull the quoted substrings out of one `olmo_cleaned_sentences` value.
#
# The value is treated as text containing a sequence of quoted strings
# (presumably a serialized list of sentences -- TODO confirm the upstream
# format). Each match is either '...' (capture group 1) or "..." (capture
# group 2).
#
# @param x A single character string.
# @return A character vector of the non-empty quoted substrings, in the order
#   they appear in `x`; `character(0)` when there are none (or `x` is NA).
extract_quoted_sentences <- function(x) {
  m <- str_match_all(x, "'([^']*)'|\"([^\"]*)\"")[[1]]
  # Exactly one capture group is filled per match. Merge the two columns row
  # by row so sentences keep their original order; concatenating the columns
  # would list every single-quoted sentence before every double-quoted one.
  vals <- m[, 2]
  use_double <- is.na(vals) | vals == ""
  vals[use_double] <- m[use_double, 3]
  vals[!is.na(vals) & vals != ""]
}

# One row per extracted sentence for the sampled tasks.
sentence_level_sample <- sampled_df |>
  mutate(
    cleaned_sentences = lapply(olmo_cleaned_sentences, extract_quoted_sentences)
  ) |>
  # Rows whose sentence list is empty are dropped by unnest()'s default.
  unnest(cleaned_sentences) |>
  # Discard fragments that are just the ", " separator between list items.
  filter(cleaned_sentences != ", ") |>
  # Drop columns not needed in the labeling sheet.
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -contains("gerrit")
  )
|
|
|
|
|
|
# Quick diagnostics on the sentence-level sample, then export it.

# `verification_sample` is only added by the verification-sampling step, which
# is currently commented out above; tabulate it only when the column exists
# instead of printing an empty table.
if ("verification_sample" %in% names(sentence_level_sample)) {
  table(sentence_level_sample$verification_sample)
}

# NOTE(review): 293 and 1.5 are undocumented constants -- presumably a
# reference row count and a scaling factor for estimating labeling effort;
# confirm and name them.
(nrow(sentence_level_sample) / 293) * 1.5

# Number of distinct tasks represented in the sample.
length(unique(sentence_level_sample$TaskPHID))

# Written relative to the current working directory, without row names.
write.csv(
  sentence_level_sample,
  "102125_human_info_sample.csv",
  row.names = FALSE
)
|