# Draw a fresh sample of tasks for human labeling and export it with one row
# per sentence.
#
# Steps:
#   1. Read the unified comment data and the existing human labels.
#   2. Among tasks that are NOT already in the human-label file, keep rows
#      with source "c1", phase "3", isAuthorWMF "TRUE" and comment_type
#      "task_description", then randomly pick up to `n_tasks_to_sample`
#      distinct TaskPHIDs.
#   3. Pull every row of `main_df` belonging to the picked tasks.
#   4. Split `olmo_cleaned_sentences` into one row per quoted string and write
#      the result to "102125_human_info_sample.csv" in the working directory.

library(tidyverse)

# Extract the quoted strings from one `olmo_cleaned_sentences` value.
#
# Matches '...' and "..." spans and returns their contents as a character
# vector in the order they occur in `x`, dropping empty and NA entries.
# NOTE(review): assumes each value is the text of a quoted list of sentences
# (e.g. "['a', \"b\"]") -- confirm against the upstream export. Strings that
# contain escaped quotes are not handled specially by this pattern.
extract_quoted_strings <- function(x) {
  m <- str_match_all(x, "'([^']*)'|\"([^\"]*)\"")[[1]]
  # Exactly one of the two capture groups is filled per match; coalescing
  # per match (instead of concatenating the two columns) keeps the sentences
  # in their original order when both quoting styles are present.
  vals <- coalesce(m[, 2], m[, 3])
  vals[vals != "" & !is.na(vals)]
}

main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(human_csv, header = TRUE)

# Tasks present in the human-label file; excluded from the new draw below.
selected_task_phids <- unique(human_df$TaskPHID)

set.seed(1893) # For reproducibility

# moment 1 of sampling
#sampled_df <- main_df %>%
#  group_by(source) %>%
#  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
#  ungroup() %>%
#  filter(sampled_TaskPHID) %>%
#  select(-sampled_TaskPHID)

# moment 2 of sampling
n_tasks_to_sample <- 10L

eligible_tasks <- main_df |>
  filter(
    !(TaskPHID %in% selected_task_phids),
    source == "c1",
    phase == "3",
    isAuthorWMF == "TRUE",
    comment_type == "task_description"
  )

eligible_task_phids <- unique(eligible_tasks$TaskPHID)

# sample() without replacement errors when asked for more items than exist,
# so cap the draw at the number of eligible tasks and say so.
if (length(eligible_task_phids) < n_tasks_to_sample) {
  warning(
    "Only ", length(eligible_task_phids), " eligible tasks; requested ",
    n_tasks_to_sample, ".",
    call. = FALSE
  )
}

# Index through sample.int() rather than sample(x, n): same random stream for
# more than one ID, but never reinterprets a single numeric ID as `1:x`.
drawn_task_phids <- eligible_task_phids[
  sample.int(
    length(eligible_task_phids),
    min(n_tasks_to_sample, length(eligible_task_phids))
  )
]

sampled_tasks <- eligible_tasks |>
  filter(TaskPHID %in% drawn_task_phids)

sampled_task_phids <- unique(sampled_tasks$TaskPHID)

# All rows (not only the task descriptions) for the sampled tasks.
sampled_df <- main_df |>
  filter(TaskPHID %in% sampled_task_phids)

#labeling_sampled_df <- sampled_df %>%
#  group_by(source) %>%
#  mutate(
#    verification_sample = if_else(
#      TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
#    )
#  ) %>%
#  ungroup()

#sentence_level_sample <- labeling_sampled_df |>
sentence_level_sample <- sampled_df |>
  mutate(
    cleaned_sentences = lapply(olmo_cleaned_sentences, extract_quoted_strings)
  ) |>
  unnest(cleaned_sentences) |>
  # Drop the ", " separators that the quote pattern can pick up between items.
  filter(cleaned_sentences != ", ") |>
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -contains("gerrit")
  )

# Sanity checks on the sample.
# `verification_sample` is only created by the commented-out labeling block
# above, so tabulate it only when the column is actually present.
if ("verification_sample" %in% names(sentence_level_sample)) {
  print(table(sentence_level_sample$verification_sample))
}
# NOTE(review): 293 and 1.5 are unexplained constants -- presumably a
# labeling-effort estimate; confirm and name them.
(nrow(sentence_level_sample) / 293) * 1.5
length(unique(sentence_level_sample$TaskPHID))

write.csv(sentence_level_sample, "102125_human_info_sample.csv", row.names = FALSE)