# Assemble the unified comment-level data set.
#
# Steps performed by this script:
#   1. Read the main unified CSV and drop its existing `olmo*` columns.
#   2. Left-join the PCA columns onto it by `id`            -> `first_join`.
#   3. Left-join the OLMo sentences/labels onto that by `id` -> `second_join`.
#   4. Read the two human-label samples (not merged yet; see TODO at the end).

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Main data ----

main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Filter out the existing olmo columns; they are re-attached below from
# `olmo_csv`, and keeping both would make the join produce suffixed duplicates.
main_df <- main_df |>
  select(-starts_with("olmo"))

# PCA columns ----

pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)

# Keep only the "PC"-prefixed columns plus the join key.
# NOTE(review): starts_with() ignores case by default, so any column whose
# name begins with "pc" would be kept as well -- confirm that is intended.
pca_df <- pca_df |>
  select(starts_with("PC"), id)

# NOTE(review): if `id` is not unique in `pca_df`, this join duplicates rows of
# `main_df`. Consider `relationship = "many-to-one"` once uniqueness is
# confirmed.
first_join <- main_df |>
  left_join(pca_df, by = "id")

# OLMo sentence labels ----

olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_101325_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)

# Keep the join key and the two OLMo columns, renamed with an `olmo_` prefix.
olmo_df <- olmo_df |>
  select(
    id,
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  )

# NOTE(review): same fan-out caveat as above if `id` repeats in `olmo_df`.
second_join <- first_join |>
  left_join(olmo_df, by = "id")

# Human labels ----

# Wrangling human labels: both samples are only read here; nothing below
# uses them yet (see TODO).
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)

small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

# TODO
# [ ] collate the two samples into one
# [ ] aggregate sentence level rows into comment level
# [ ] merge into unified data set