1
0

updating collation scripts, more work TODO

This commit is contained in:
Matthew Gaughan 2025-10-21 19:41:36 -07:00
parent 90311ca136
commit e3748fa55f
11 changed files with 1999 additions and 1949 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# Load the main CSV and discard every column whose name starts with "olmo",
# so previously stored OLMo output is not carried along when the current
# OLMo columns are joined in further down.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- select(
  read.csv(main_csv, header = TRUE),
  -starts_with("olmo")
)
# Read the PCA table and reduce it to the columns whose names start with
# "PC" plus the `id` join key.
# NOTE: starts_with() ignores case by default, so a column such as "pc1"
# would be kept as well.
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- select(
  read.csv(pca_csv, header = TRUE),
  starts_with("PC"),
  id
)
# Left join on `id`: every row of main_df is retained; rows without a match
# in pca_df get NA in the PC columns.
first_join <- left_join(main_df, pca_df, by = "id")
# Read the OLMo categorisation output and keep only the `id` key plus the two
# columns of interest, renamed with an "olmo_" prefix so their origin stays
# visible after merging.
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_101325_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE) |>
  select(
    id,
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  )
# Left join on `id` onto the result of the PCA join; unmatched rows get NA
# in the two olmo_* columns.
second_join <- left_join(first_join, olmo_df, by = "id")
# Human labels ----
# Two human-label CSVs are read into separate data frames here; they are not
# yet combined or merged into the joined data (see the TODO list below).
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"

large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

# TODO
# [ ] collate the two samples into one
# [ ] aggregate sentence level rows into comment level
# [ ] merge into unified data set