# mw-lifecycle-analysis/analysis_data/data_verification_3.R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Load the stale unified comment table and drop any olmo* columns it already
# carries; fresh OLMo output is joined back in further down.
main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE) |>
  select(!starts_with("olmo"))

# Manual dedupe of known-bad ids:
#   - ids 20846 and 20847 appear more than once; keep only their first row
#   - id 23366 is removed outright
first_rows <- main_df |>
  filter(id %in% c(20846, 20847)) |>
  distinct(id, .keep_all = TRUE)
others <- main_df |>
  filter(
    !(id %in% c(20846, 20847)),
    id != 23366
  )
# Note: the kept rows for 20846/20847 end up at the bottom of the table.
main_df <- bind_rows(others, first_rows)


# Per-task lookup built from the task_description rows: who wrote the
# description and when the task was closed. This table is left-joined onto
# every comment by TaskPHID, so it must hold at most one row per task;
# a second description row for the same task would silently duplicate every
# comment of that task in the join result.
desc_info <- main_df |>
  filter(comment_type == "task_description") |>
  # Keep the first description row per task. (The previous
  # group_by(TaskPHID) / ungroup() pair was a no-op and enforced nothing.)
  distinct(TaskPHID, .keep_all = TRUE) |>
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    # assumes date_closed is numeric epoch seconds -- TODO confirm; a character
    # column would be parsed as a date string and the origin ignored
    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
  )

# Flag comments that belong to the ADAC set.
# Step 1: timestamp each row (date_created converted with a 1970-01-01 UTC origin).
main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC"))

# Step 2: attach the task-description author and task close time to every row.
main_df <- left_join(main_df, desc_info, by = "TaskPHID")

# Step 3: ADAC is 1 when the task has a known description author, the row's
# author is that same author, and the row was created before the task was
# closed (or the task has no close time); otherwise 0 (NA if undecidable).
main_df <- main_df |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        task_desc_author == AuthorPHID &
        (created < task_desc_dateClosed | is.na(task_desc_dateClosed))
    )
  )
# Dictionary feature: number of modal-verb hits per comment.
modal_verb_list <- c("will", "may", "can", "shall", "must",
                     "ought", "do", "need", "dare",
                     "will not", "may not", "cannot", "shall not",
                     "must not", "do not", "don't", "need not",
                     "dare not", "won't", "can't")
# One alternation of all dictionary entries, wrapped in word boundaries.
modal_regex <- modal_verb_list |>
  paste(collapse = "|") |>
  (\(alternatives) paste0("\\b(", alternatives, ")\\b"))()

main_df <- main_df |>
  # Missing comment text becomes "" so it counts as zero hits instead of NA.
  mutate(comment_text = coalesce(comment_text, "")) |>
  mutate(
    # Case-insensitive count of matches in each comment.
    modal_verbs = str_count(comment_text, regex(modal_regex, ignore_case = TRUE)),
    # log(1 + count) of the hits.
    log1p_mv = log1p(modal_verbs)
  )

# Read the PCA table and keep only the PC* columns plus the id join key.
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE) |>
  select(starts_with("PC"), id)

# Attach the PCA columns to every row of main_df (rows without a match get NA).
first_join <- left_join(main_df, pca_df, by = "id")

# Read the OLMo output and keep just the id key plus the two payload columns,
# renamed with an olmo_ prefix so they cannot collide with other label sets.
olmo_csv <- "~/analysis_data/102125_constituent_dfs/all_102125_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE) |>
  select(
    id,
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  )

# Attach the OLMo columns by id on top of the PCA-augmented table.
second_join <- left_join(first_join, olmo_df, by = "id")

# Human labels: two separately stored samples, both read from CSV.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"

# Reduce each sample to the same three columns so they can be stacked.
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE) |>
  select(id, cleaned_sentences, human_label)
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE) |>
  select(id, cleaned_sentences, human_label)

# Done: stack the two samples into one table (large sample first).
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)

# Done: collapse the stacked rows to one row per id; the sentences and the
# whitespace-squished labels of each id are stored as list-column entries.
human_labels_reduced <- human_labels_df |>
  group_by(id) |>
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )

# Done: attach the aggregated human labels to the unified table by id.
third_join <- left_join(second_join, human_labels_reduced, by = "id")

# Done: drop the unneeded same_author field and flatten the two human-label
# columns to plain character so the table can be written out as CSV.
unified_df <- third_join |>
  select(!same_author) |>
  mutate(
    across(
      c(human_labels, cleaned_sentences),
      \(col) {
        # Coerce each entry to a single string; entries whose string form
        # (after whitespace squishing) is exactly "NULL" are recorded as NA.
        # Presumably these are the rows with no human labels -- verify.
        flat <- as.character(col)
        ifelse(str_squish(flat) == "NULL", NA_character_, flat)
      }
    )
  )

# Done: sanity checks on the unified set.
# Number of distinct tasks and distinct ids (printed when run at top level).
unified_df$TaskPHID |> unique() |> length()
unified_df$id |> unique() |> length()

# Spot-check rows for manual inspection; the second assignment replaces the first.
pulling <- unified_df |>
  filter(id %in% c("24695", "24696"))

# The ids touched by the manual dedupe step near the top of the script.
pulling <- unified_df |>
  filter(id %in% c("23366", "20846", "20847"))

# Write the unified table to the current working directory, without row names.
write.csv(unified_df, "102725_unified.csv", row.names = FALSE)