# 1
# 0
# mw-lifecycle-analysis/analysis_data/data_verification_3.R
# 2025-10-23 13:50:27 -07:00
#
# 91 lines
# 2.7 KiB
# R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# Load the unified data set from CSV.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Drop every existing column whose name starts with "olmo"; the olmo columns
# are re-attached from a separate file later in this script.
main_df <- main_df |>
  select(!starts_with("olmo"))

# Dedupe the task with a changed title and the duplicate entries:
#   * ids 20846 and 20847 are kept once each (first occurrence wins),
#   * id 23366 is removed entirely.
first_rows <- main_df |>
  filter(id %in% c(20846, 20847)) |>
  distinct(id, .keep_all = TRUE)

# `id != 23366` stays a separate predicate because, unlike `%in%`, it also
# drops rows whose id is NA.
others <- main_df |>
  filter(
    !(id %in% c(20846, 20847)),
    id != 23366
  )

# The deduplicated rows are appended after all remaining rows.
main_df <- bind_rows(others, first_rows)
# Per-task lookup built from the task-description rows: the description's
# author and the task's close time as a UTC POSIXct.
# NOTE(review): the `origin` argument suggests date_closed holds numeric
# epoch seconds -- confirm against the CSV.
desc_info <- main_df %>%
  filter(comment_type == "task_description") %>%
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(
      date_closed,
      origin = "1970-01-01",
      tz = "UTC"
    )
  )

# This lookup is left-joined onto main_df by TaskPHID, so it must hold at most
# one row per task; otherwise every comment of that task is duplicated by the
# join. The former group_by(TaskPHID) %>% ungroup() pair was a no-op and did
# not guarantee this, so enforce it explicitly and report when it triggers.
if (anyDuplicated(desc_info[["TaskPHID"]]) > 0) {
  warning(
    "Multiple task_description rows share a TaskPHID; ",
    "keeping the first row per task.",
    call. = FALSE
  )
  desc_info <- desc_info %>%
    distinct(TaskPHID, .keep_all = TRUE)
}
# Identify the comments that belong to the ADAC set.
# A row gets ADAC = 1 when its task has a description author, the row's author
# is that same author, and the row was created before the task's close time;
# it gets 0 when one of these conditions is FALSE.
# NOTE(review): when the author matches but either timestamp is NA (e.g. a
# task without a close date), the comparison yields NA and so does ADAC --
# confirm that this is the intended coding.
main_df <- main_df |>
  mutate(
    created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")
  ) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        created < task_desc_dateClosed
    )
  )
# Attach the PCA columns: keep only the columns whose name starts with "PC"
# plus the `id` join key, then left-join them onto the main data by id.
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE) |>
  select(starts_with("PC"), id)

first_join <- left_join(main_df, pca_df, by = "id")
# Attach the olmo sentence output: keep `id` plus the cleaned sentences and
# their categories, renamed with an `olmo_` prefix, then left-join onto the
# PCA-augmented data by id.
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_101325_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE) |>
  select(
    id,
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  )

second_join <- left_join(first_join, olmo_df, by = "id")
# Wrangling human labels: two separately labelled samples are read and reduced
# to the same three columns so they can be stacked.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"

large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE) |>
  select(id, cleaned_sentences, human_label)
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE) |>
  select(id, cleaned_sentences, human_label)

# TODO
# [ x ] collate the two samples into one
# The large sample's rows come first, followed by the small sample's rows.
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)
# [ x ] aggregate sentence level rows into comment level
# One output row per id; the sentences and the whitespace-squished labels of
# that id are each stored as a single list-column entry.
human_labels_reduced <- human_labels_df |>
  group_by(id) |>
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )
# [ ] merge into unified data set