# Libraries (stringr, tidyr, dplyr, and purrr are all attached by tidyverse)
library(tidyverse)

main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Drop stale olmo columns; fresh olmo labels are joined back in below
main_df <- main_df |>
  select(-starts_with("olmo"))

# Dedupe: a task with a changed title produced duplicate entries.
# Keep one row each for ids 20846/20847 and drop id 23366 outright.
first_rows <- main_df |>
  filter(id %in% c(20846, 20847)) |>
  distinct(id, .keep_all = TRUE)
others <- main_df |>
  filter(!(id %in% c(20846, 20847))) |>
  filter(id != 23366)
main_df <- bind_rows(others, first_rows)

# One row per task description: its author and the task's close time.
# (If a TaskPHID ever carries multiple task_description rows, the join
# below will fan out; the integrity check at the end catches that.)
desc_info <- main_df %>%
  filter(comment_type == "task_description") %>%
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
  )

# Identify comments in the ADAC set: written by the task-description
# author before the task was closed
main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
    )
  )

# Add dictionary values: count modal verbs per comment
modal_verb_list <- c("will", "may", "can", "shall", "must", "ought", "do",
                     "need", "dare", "will not", "may not", "cannot",
                     "shall not", "must not", "do not", "don't", "need not",
                     "dare not", "won't", "can't")
# Sort longest-first so multiword forms (e.g. "will not") match as a unit
# rather than regex alternation stopping at the bare stem ("will")
modal_verb_list <- modal_verb_list[order(-nchar(modal_verb_list))]
modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")
main_df <- main_df |>
  mutate(
    comment_text = dplyr::coalesce(comment_text, ""),  # handle NA
    modal_verbs = stringr::str_count(
      comment_text,
      stringr::regex(modal_regex, ignore_case = TRUE)
    ),
    log1p_mv = log1p(modal_verbs)
  )

# Join in PCA components
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
pca_df <- pca_df |>
  select(starts_with("PC"), id)
first_join <- main_df |>
  left_join(pca_df, by = "id")

# Join in fresh olmo sentence labels
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_102125_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
olmo_df <- olmo_df |>
  rename(olmo_cleaned_sentences = cleaned_sentences,
         olmo_sentence_labels = sentence_categories) |>
  select(id, olmo_cleaned_sentences, olmo_sentence_labels)
second_join <- first_join |>
  left_join(olmo_df, by = "id")

# Wrangling human labels
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

# TODO
# [ x ] collate the two samples into one
large_human_labels_df <- large_human_labels_df |>
  select(id, cleaned_sentences, human_label)
small_human_labels_df <- small_human_labels_df |>
  select(id, cleaned_sentences, human_label)
human_labels_df <- bind_rows(large_human_labels_df, small_human_labels_df)

# [ x ] aggregate sentence-level rows into comment-level rows
human_labels_reduced <- human_labels_df %>%
  group_by(id) %>%
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )

# [ x ] merge into unified data set
third_join <- second_join |>
  left_join(human_labels_reduced, by = "id")

# [ x ] clean/drop needless fields; list-columns with no labels deparse to
# the literal string "NULL" under as.character(), so recode those to NA
unified_df <- third_join |>
  select(-same_author) |>
  mutate(across(c(human_labels, cleaned_sentences), ~ {
    x <- as.character(.x)
    x_trim <- str_squish(x)
    ifelse(x_trim == "NULL", NA_character_, x)
  }))
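# Integrity check (a minimal sketch added here, not part of the original
# pipeline): the three left joins above should never fan out rows, i.e.
# unified_df should still have exactly one row per comment id. This assumes
# `id` uniquely identifies a comment, which the dedupe step establishes.
# stopifnot() aborts the script if a join key was duplicated upstream
# (e.g. multiple task_description rows per TaskPHID in desc_info).
stopifnot(nrow(unified_df) == n_distinct(unified_df$id))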
# [ x ] verify set
length(unique(unified_df$TaskPHID))
length(unique(unified_df$id))

# Spot-check a pair of ids interactively
pulling_a <- unified_df |>
  filter(id %in% c(24695, 24696))
# Spot-check the rows affected by the dedupe above
pulling_b <- unified_df |>
  filter(id %in% c(23366, 20846, 20847))

write.csv(unified_df, "102725_unified.csv", row.names = FALSE)
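# Round-trip check (a sketch under the assumption the export lands in the
# working directory, as write.csv above implies): re-read the CSV and
# confirm the row count survived serialization.
reread <- read.csv("102725_unified.csv", header = TRUE)
stopifnot(nrow(reread) == nrow(unified_df))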