From 0ed72af495a0b1e0e5c665198c4c963869c18a2e Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Thu, 23 Oct 2025 13:50:27 -0700
Subject: [PATCH] add scripts for other aggregation and merge tasks

---
Reviewer notes (this section is not part of the commit message):
- Re-flowed from a whitespace-collapsed copy of the patch. Indentation and the
  position of blank context lines are reconstructed, so context may need
  adjusting (or `git apply --ignore-whitespace`) before it applies.
- The blob ids on the `index` lines predate the review edits below.
- desc_info: the no-op group_by()/ungroup() pair is replaced by distinct().
- Plot: xlim() is replaced by coord_cartesian(); library(ggdist) moved to top.

 analysis_data/data_verification_3.R  | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 analysis_data/style_dict_variables.R | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 analysis_data/style_dict_variables.R

diff --git a/analysis_data/data_verification_3.R b/analysis_data/data_verification_3.R
index bc02f48..c60e722 100644
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@@ -11,6 +11,43 @@ main_df <- read.csv(main_csv, header = TRUE)
 main_df <- main_df |>
   select(-starts_with("olmo"))
 
+# Dedupe: task with a changed title and duplicate entries.
+# Keep one row for each of ids 20846 and 20847, and drop id 23366 entirely.
+first_rows <- main_df |>
+  filter(id %in% c(20846, 20847)) |>
+  distinct(id, .keep_all = TRUE)
+others <- main_df |>
+  filter(!(id %in% c(20846, 20847))) |>
+  filter(id != 23366)
+main_df <- bind_rows(others, first_rows)
+
+
+# One row per task, taken from its "task_description" entry: the description
+# author and that row's date_closed. distinct() keeps the first description
+# row per TaskPHID so the left_join() below cannot duplicate comment rows.
+desc_info <- main_df |>
+  filter(comment_type == "task_description") |>
+  distinct(TaskPHID, .keep_all = TRUE) |>
+  transmute(
+    TaskPHID,
+    task_desc_author = AuthorPHID,
+    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
+  )
+
+# Identify comments in the ADAC set: ADAC is 1 when the row's author is the
+# task-description author and the row was created before task_desc_dateClosed.
+# NOTE(review): if date_closed is missing, rows by the description author get
+# ADAC = NA rather than 0 or 1 -- confirm that is intended for such tasks.
+main_df <- main_df |>
+  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
+  left_join(desc_info, by = "TaskPHID") |>
+  mutate(
+    ADAC = as.integer(!is.na(task_desc_author) &
+                        AuthorPHID == task_desc_author &
+                        created < task_desc_dateClosed)
+  )
+
+
 pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
 pca_df <- read.csv(pca_csv, header = TRUE)
 
@@ -45,6 +82,17 @@ large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
 small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
 small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)
 #TODO
-# [ ] collate the two samples into one
-# [ ] aggregate sentence level rows into comment level
+# [ x ] collate the two samples into one
+large_human_labels_df <- large_human_labels_df |> select(id, cleaned_sentences, human_label)
+small_human_labels_df <- small_human_labels_df |> select(id, cleaned_sentences, human_label)
+human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)
+# [ x ] aggregate sentence level rows into comment level
+# One row per id; sentences and whitespace-squished labels become list-columns.
+human_labels_reduced <- human_labels_df |>
+  group_by(id) |>
+  summarise(
+    cleaned_sentences = list(cleaned_sentences),
+    human_labels = list(stringr::str_squish(human_label)),
+    .groups = "drop"
+  )
 # [ ] merge into unified data set
\ No newline at end of file
diff --git a/analysis_data/style_dict_variables.R b/analysis_data/style_dict_variables.R
new file mode 100644
index 0000000..ff5c4cb
--- /dev/null
+++ b/analysis_data/style_dict_variables.R
@@ -0,0 +1,42 @@
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+library(ggdist)
+
+main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+
+# Words and phrases counted as modal verbs. A negated phrase such as
+# "will not" still contributes a single regex match per occurrence.
+modal_verb_list <- c("will", "may", "can", "shall", "must",
+                     "ought", "do", "need", "dare",
+                     "will not", "may not", "cannot", "shall not",
+                     "must not", "do not", "don't", "need not",
+                     "dare not", "won't", "can't")
+# Whole-word alternation over the list; applied case-insensitively below.
+modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")
+
+# Per-comment modal verb count (missing text counts as 0) and its log1p.
+main_df <- main_df |>
+  mutate(
+    comment_text = dplyr::coalesce(comment_text, ""), # handle NA
+    modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)),
+    log1p_mv = log1p(modal_verbs)
+  )
+
+
+table(main_df$modal_verbs)
+# coord_cartesian() zooms to 0-5 without discarding comments that have more
+# than 5 modal verbs; xlim() would drop them before the stat is computed.
+ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
+  stat_slabinterval() +
+  coord_cartesian(xlim = c(0, 5)) +
+  labs(
+    title = "Distribution of modal_verbs by isAuthorWMF",
+    x = "Number of modal verbs in comment",
+    y = "isAuthorWMF"
+  ) +
+  theme_minimal()