updating collation scripts, more work TODO
This commit is contained in:
parent
90311ca136
commit
e3748fa55f
File diff suppressed because it is too large
Load Diff
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
1949
analysis_data/102125_constituent_dfs/102125_human_info_sample.csv
Normal file
1949
analysis_data/102125_constituent_dfs/102125_human_info_sample.csv
Normal file
File diff suppressed because it is too large
Load Diff
|
Can't render this file because it is too large.
|
50
analysis_data/data_verification_3.R
Normal file
50
analysis_data/data_verification_3.R
Normal file
@ -0,0 +1,50 @@
|
||||
library(tidyverse)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
# Read the unified data set, then discard every column whose name starts
# with "olmo" (the existing olmo columns are not carried forward).
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- select(
  read.csv(main_csv, header = TRUE),
  -starts_with("olmo")
)
|
||||
|
||||
# Read the PCA table and keep only the "PC"-prefixed columns plus the `id`
# key used for joining.
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- select(
  read.csv(pca_csv, header = TRUE),
  starts_with("PC"),
  id
)
|
||||
|
||||
# Attach the PCA columns to the main table; every row of main_df is kept.
# NOTE(review): assumes `id` is unique in pca_df, otherwise rows of main_df
# are duplicated by the join -- confirm.
first_join <- left_join(main_df, pca_df, by = "id")
|
||||
|
||||
# Read the categorized olmo output and keep `id` plus the two sentence
# columns, renamed with an "olmo_" prefix.
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_101325_olmo_batched_categorized.csv"
olmo_raw <- read.csv(olmo_csv, header = TRUE)

# Renaming inside select() yields the same three columns, in the same order,
# as the mutate-then-select sequence.
olmo_df <- select(
  olmo_raw,
  id,
  olmo_cleaned_sentences = cleaned_sentences,
  olmo_sentence_labels = sentence_categories
)
|
||||
|
||||
# Attach the olmo sentence columns to the PCA-augmented table, again keeping
# every row of the left-hand side.
# NOTE(review): assumes one row per `id` in olmo_df -- confirm.
second_join <- left_join(first_join, olmo_df, by = "id")
|
||||
|
||||
# Wrangling human labels ----
# Paths to the two human-label samples (large and small).
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"

# Load both samples; they are not yet combined or joined to the main data.
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)
|
||||
#TODO
|
||||
# [ ] collate the two samples into one
|
||||
# [ ] aggregate sentence-level rows to the comment level
|
||||
# [ ] merge into unified data set
|
||||
Loading…
Reference in New Issue
Block a user