add scripts for other aggregation and merge tasks
This commit is contained in:
parent
e3748fa55f
commit
0ed72af495
@ -11,6 +11,37 @@ main_df <- read.csv(main_csv, header = TRUE)
|
||||
# Drop every column whose name starts with "olmo".
main_df <- select(main_df, -starts_with("olmo"))
|
||||
|
||||
# Dedupe tasks with a changed title and duplicate entries.
dup_task_ids <- c(20846, 20847)

# Keep only the first row for each of the duplicated ids.
first_rows <- main_df |>
  filter(id %in% dup_task_ids) |>
  distinct(id, .keep_all = TRUE)

# All remaining rows, with the duplicated ids and id 23366 removed.
others <- main_df |>
  filter(!(id %in% dup_task_ids), id != 23366)

# Re-attach the deduplicated rows after the untouched ones.
main_df <- bind_rows(others, first_rows)
|
||||
|
||||
|
||||
# Per-task lookup: author of the task description and the task's close time.
# NOTE(review): date_closed is assumed to be epoch seconds (UTC) — confirm.
desc_info <- main_df %>%
  filter(comment_type == "task_description") %>%
  # The previous group_by(TaskPHID) %>% ungroup() pair did nothing. Keep the
  # first description row per task so that joining on TaskPHID cannot
  # multiply comment rows when a task has several description rows.
  distinct(TaskPHID, .keep_all = TRUE) %>%
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(
      date_closed,
      origin = "1970-01-01",
      tz = "UTC"
    )
  )
|
||||
|
||||
# Identify comments in the ADAC set.
# ADAC is 1 when a task description row was matched, the comment author equals
# the task-description author, and the comment was created before the task's
# close time; it is 0 when any of those tests is FALSE.
# NOTE(review): when the first two tests hold but task_desc_dateClosed is NA,
# the comparison is NA and so is ADAC — confirm that is intended.
main_df <- main_df |>
  mutate(
    created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")
  ) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        created < task_desc_dateClosed
    )
  )
|
||||
|
||||
|
||||
# Read the PCA table; the first row of the file supplies the column names.
pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
pca_df <- read.csv(file = pca_csv, header = TRUE)
|
||||
|
||||
@ -45,6 +76,16 @@ large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
|
||||
# Read the smaller human-labelled sample; the first row supplies column names.
small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(file = small_human_labels_csv, header = TRUE)
|
||||
#TODO
|
||||
# [ ] collate the two samples into one
|
||||
# [ ] aggregate sentence level rows into comment level
|
||||
# [x] collate the two samples into one
# Narrow both samples to the same three columns, then stack them row-wise.
label_cols <- c("id", "cleaned_sentences", "human_label")
large_human_labels_df <- select(large_human_labels_df, all_of(label_cols))
small_human_labels_df <- select(small_human_labels_df, all_of(label_cols))
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)
|
||||
# [x] aggregate sentence level rows into comment level
# One output row per id: the sentences and the whitespace-squished labels of
# that id are each collected into a list-column.
human_labels_reduced <- summarise(
  group_by(human_labels_df, id),
  cleaned_sentences = list(cleaned_sentences),
  human_labels = list(str_squish(human_label)),
  .groups = "drop"
)
|
||||
# [ ] merge into unified data set
|
||||
36
analysis_data/style_dict_variables.R
Normal file
36
analysis_data/style_dict_variables.R
Normal file
@ -0,0 +1,36 @@
|
||||
library(tidyverse)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
# Read the unified data set; the first row of the file supplies column names.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(file = main_csv, header = TRUE)
|
||||
|
||||
|
||||
# Modal verbs to count: the plain forms first, then their negated and
# contracted forms, in the order they are tried by the alternation.
plain_modals <- c(
  "will", "may", "can", "shall", "must",
  "ought", "do", "need", "dare"
)
negated_modals <- c(
  "will not", "may not", "cannot", "shall not",
  "must not", "do not", "don't", "need not",
  "dare not", "won't", "can't"
)
modal_verb_list <- c(plain_modals, negated_modals)

# Single alternation of all forms, wrapped in word boundaries.
modal_alternation <- paste(modal_verb_list, collapse = "|")
modal_regex <- paste0("\\b(", modal_alternation, ")\\b")
|
||||
|
||||
# Count modal-verb matches per comment (case-insensitive) and add a
# log1p-transformed copy of the count.
main_df <- main_df |>
  mutate(
    # Missing comment text becomes an empty string.
    comment_text = coalesce(comment_text, ""),
    modal_verbs = str_count(
      comment_text,
      regex(modal_regex, ignore_case = TRUE)
    ),
    log1p_mv = log1p(modal_verbs)
  )
|
||||
|
||||
|
||||
# Frequency table of the per-comment modal-verb counts.
table(main_df[["modal_verbs"]])

library(ggdist)

# Slab + interval plot of modal_verbs (x) against isAuthorWMF (y).
# NOTE(review): the x limits of 0-5 drop rows outside that range before the
# stat is computed, rather than only zooming the view — confirm intended.
ggplot(data = main_df, mapping = aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  scale_x_continuous(limits = c(0, 5)) +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  ) +
  theme_minimal()
|
||||
Loading…
Reference in New Issue
Block a user