# Descriptive statistics for the unified comment data and the human-labelled
# sample.
#
# Reads two CSV files from ~/analysis_data, splits the unified data into
# task-description rows and all other rows, prints a few summary counts and
# medians, then collapses the human labels to one row per labelled item and
# joins the unified data onto it.

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
library(scales) # attached last, so its masking of earlier packages is kept

# Unified data ----

pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
pre_unified_df <- read.csv(pre_unified_csv, header = TRUE)

# Split on comment_type. filter() drops rows where the comparison is NA, so a
# row with a missing comment_type lands in neither data frame.
unified_task_df <- pre_unified_df |>
  filter(comment_type == "task_description")
unified_comment_df <- pre_unified_df |>
  filter(comment_type != "task_description")

# Task-description rows per source / phase / isAuthorWMF combination.
# count() on an ungrouped data frame returns an ungrouped result, so sum(n) is
# the overall total and pct is each combination's share of all rows.
unified_task_df |>
  count(source, phase, isAuthorWMF) |>
  mutate(pct = round(n / sum(n), 3))

# Median number of non-task-description rows per AuthorPHID.
median_n <- unified_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_n

# Median number of rows per TaskPHID (task-description rows included).
# NOTE: median_comments is reassigned at the end of this script; this value
# is only available until then.
median_comments <- pre_unified_df |>
  count(TaskPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments

# Human labels ----

human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(human_csv, header = TRUE)

# Missing labels become the literal string "NA" and whitespace is squished.
# Rows are then collapsed to one per distinct combination of the grouping
# columns, with that combination's labels gathered into a list column.
cleaned_human_df <- human_df |>
  mutate(
    human_label = replace_na(human_label, "NA"),
    human_label = str_squish(human_label)
  ) |>
  group_by(
    id, TaskPHID, AuthorPHID, comment_text, task_title,
    comment_type, priority, source, phase
  ) |>
  summarise(human_labels = list(human_label), .groups = "drop")

# Non-task-description rows of the human-labelled sample, built the same way
# as unified_comment_df. It is taken before the join below because the join
# renames comment_type to comment_type.x / comment_type.y.
# NOTE(review): human_comment_df was used below without ever being assigned;
# this definition is assumed from the unified_comment_df pattern -- confirm
# that task descriptions are meant to be excluded here.
human_comment_df <- cleaned_human_df |>
  filter(comment_type != "task_description")

# Attach the unified columns. Non-key columns present in both tables (e.g.
# comment_type, source, phase) get dplyr's default ".x" / ".y" suffixes.
cleaned_human_df <- cleaned_human_df |>
  left_join(pre_unified_df, by = c("id", "TaskPHID", "AuthorPHID"))

# Median number of human-labelled comment rows per AuthorPHID. This
# overwrites the per-task median_comments computed earlier.
median_comments <- human_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments