library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Load the unified data set ----
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# filter out existing olmo stuff
main_df <- main_df |>
  select(-starts_with("olmo"))

# dedupe Task with changed title and duplicate entries:
# ids 20846 and 20847 are reduced to their first row each, and id 23366 is
# removed entirely.
duplicated_ids <- c(20846, 20847)
first_rows <- main_df |>
  filter(id %in% duplicated_ids) |>
  distinct(id, .keep_all = TRUE)
others <- main_df |>
  filter(!(id %in% duplicated_ids), id != 23366)
main_df <- bind_rows(others, first_rows)

# Task-level lookup built from the task description rows ----
# Exactly one row per TaskPHID is kept (the first one encountered) so that the
# left join below cannot duplicate comment rows when a task has more than one
# description row. The previous group_by() |> ungroup() pair was a no-op and
# did not guarantee this.
desc_info <- main_df |>
  filter(comment_type == "task_description") |>
  distinct(TaskPHID, .keep_all = TRUE) |>
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(
      date_closed,
      origin = "1970-01-01",
      tz = "UTC"
    )
  )

# identifying comments in ADAC set:
# ADAC is 1 when the row's author is the task description's author and the row
# was created before the task's close date, 0 otherwise.
# NOTE(review): if task_desc_dateClosed is NA, rows by the description author
# evaluate to NA (not 0 or 1) -- confirm that this is intended.
main_df <- main_df |>
  mutate(
    created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")
  ) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        created < task_desc_dateClosed
    )
  )

# Attach the PCA columns (every column starting with "PC") by id ----
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
pca_df <- pca_df |>
  select(starts_with("PC"), id)
first_join <- main_df |>
  left_join(pca_df, by = "id")

# Attach the OLMo sentence splits and labels by id ----
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_101325_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
# Rename while selecting so the columns carry an "olmo_" prefix after the join.
olmo_df <- olmo_df |>
  select(
    id,
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  )
second_join <- first_join |>
  left_join(olmo_df, by = "id")

# wrangling human labels ----
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

# TODO
# [ x ] collate the two samples into one
large_human_labels_df <- large_human_labels_df |>
  select(id, cleaned_sentences, human_label)
small_human_labels_df <- small_human_labels_df |>
  select(id, cleaned_sentences, human_label)
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)

# [ x ] aggregate sentence level rows into comment level
# One row per id; the sentences and their whitespace-squished labels are
# stored as list-columns.
human_labels_reduced <- human_labels_df |>
  group_by(id) |>
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )

# [ ] merge into unified data set