# R script, 50 lines, 1.3 KiB
library(tidyverse)
|
|
library(stringr)
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(purrr)
|
|
|
|
# Read the unified CSV export (first row holds the column names) into a
# base data frame. The path is kept in its own variable.
pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
pre_unified_df <- read.csv(file = pre_unified_csv, header = TRUE)
|
|
|
|
# Partition the unified rows on comment_type:
#   unified_task_df    - rows tagged "task_description"
#   unified_comment_df - rows tagged with any other value
# filter() keeps only rows where the condition is TRUE, so a row whose
# comment_type is NA ends up in neither table.
unified_task_df <- filter(pre_unified_df, comment_type == "task_description")
unified_comment_df <- filter(pre_unified_df, comment_type != "task_description")
|
|
|
|
library(scales)
|
|
|
|
# Tabulate unified_task_df by source, phase and isAuthorWMF, then add `pct`:
# each combination's share of all rows in the table, as a proportion in
# [0, 1] rounded to three decimals. The result is not assigned; it is left
# as a top-level expression.
mutate(
  count(unified_task_df, source, phase, isAuthorWMF),
  pct = round(n / sum(n), digits = 3)
)
|
|
|
|
# Median number of rows per AuthorPHID in unified_comment_df.
# count() yields one row per AuthorPHID with the tally in column `n`.
median_n <- median(
  count(unified_comment_df, AuthorPHID)[["n"]],
  na.rm = TRUE
)
median_n
|
|
|
|
# Median number of rows per TaskPHID in pre_unified_df.
# The tally is taken on the unfiltered table, so rows tagged
# "task_description" are counted together with all other rows.
median_comments <- median(
  count(pre_unified_df, TaskPHID)[["n"]],
  na.rm = TRUE
)
median_comments
|
|
|
|
# Read the human-label CSV (first row holds the column names) into a base
# data frame. The path is kept in its own variable.
human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(file = human_csv, header = TRUE)
|
|
|
|
# Normalise the label text and collapse to one row per labelled item.
#  1. A missing human_label becomes the literal string "NA"; every label is
#     then whitespace-squished (trimmed, inner runs reduced to one space).
#  2. Rows sharing the same id / task / author / text / metadata are folded
#     into a single row whose `human_labels` list-column holds all of that
#     group's labels. Grouping is dropped afterwards.
cleaned_human_df <- human_df |>
  mutate(human_label = str_squish(replace_na(human_label, "NA"))) |>
  group_by(
    id, TaskPHID, AuthorPHID,
    comment_text, task_title, comment_type,
    priority, source, phase
  ) |>
  summarise(human_labels = list(human_label), .groups = "drop")
|
|
|
|
# Attach the unified-export columns to each human-labelled row.
# Both tables carry comment_type, source and phase (and possibly more), so
# joining on the three keys alone would duplicate those columns with .x / .y
# suffixes and leave no plain `comment_type` to filter on. Only the columns
# that cleaned_human_df does not already have are taken from pre_unified_df;
# for every shared column the human-label file's copy is the one kept.
join_keys <- c("id", "TaskPHID", "AuthorPHID")
unified_only_cols <- setdiff(names(pre_unified_df), names(cleaned_human_df))

cleaned_human_df <- cleaned_human_df |>
  left_join(
    pre_unified_df |> select(all_of(c(join_keys, unified_only_cols))),
    by = join_keys
  )

# human_comment_df has no other definition in this script, so it is built
# here; without it the median below fails with "object not found".
# NOTE(review): defined the same way as unified_comment_df (every row that is
# not a task description) - confirm this is the intended subset.
human_comment_df <- cleaned_human_df |>
  filter(comment_type != "task_description")

# Median number of human-labelled rows per AuthorPHID.
# This reuses the name median_comments and therefore overwrites the per-task
# median computed from pre_unified_df earlier in the script.
median_comments <- human_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments
|