# mw-lifecycle-analysis/analysis_data/data_verification.R
# Last updated: 2025-09-29 14:10:39 -07:00
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# TODO
# join the label data with the existing data from 0714 master
# download and set up DSL library
# figure out how to use the sentence-level variables
# get the categorical variables encoded as integers, then wrapped as factors
# figure out power at 200, 400, 500, 750, and 1000
# Step 1: join the labeled comments with the existing master discussion data.
# Load the Biber+ labeled comments and the master discussion data,
# flag duplicate rows, and keep one row per unique comment.
labeled_csv <- "~/p2/quest/092325_biberplus_complete_labels.csv"
labeled_df <- read.csv(labeled_csv, header = TRUE)

main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Duplicate diagnostics: rows sharing all five identifying fields.
dupes_labeled <- labeled_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)
dupes_main <- main_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)

# Deduplicate on a four-column key before joining.
# NOTE(review): the dedup key differs from the duplicate-check key above
# (task_title here vs id/TaskPHID there) — confirm this is intentional.
labeled_df_unique <- labeled_df |>
  distinct(comment_text, date_created, task_title, AuthorPHID, .keep_all = TRUE)
main_df_unique <- main_df |>
  distinct(comment_text, date_created, task_title, AuthorPHID, .keep_all = TRUE)
# Left-join the labels onto the master data on the shared identifying
# columns, then check for and remove any join-induced duplicates.
joined_df <- left_join(
  labeled_df_unique,
  main_df_unique,
  by = c("comment_text", "date_created", "task_title", "id",
         "AuthorPHID", "TaskPHID", "comment_type")
)

# Rows duplicated by the join (a fan-out on the join keys shows up here).
dupes_joined <- joined_df |>
  count(id, comment_text, date_created) |>
  filter(n > 1)

# Spot check of one known id. Wrapped in print() so the check is visible
# even when this script is run via source(), where bare top-level
# expressions do NOT auto-print (previously this line was silent there).
print(
  joined_df |>
    filter(id == "1692131")
)

joined_df_unique <- joined_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id, .keep_all = TRUE)
# TODO
# bring in gerrit data
# bring in OLMO data
# re-id
# Step 2: load the Gerrit review data and extract structured fields from
# the 'selected_gerrit_results' column, which appears to hold a
# Python-dict-like serialized string (single-quoted keys and values).
# (Fixed: top-level assignment used `=`; standard R style is `<-`.)
gerrit_csv <- "~/analysis_data/constituent_dfs/080425_gerrit_filled_df.csv"
gerrit_df <- read.csv(gerrit_csv, header = TRUE)

# Duplicate diagnostic on the five identifying fields.
dupes_gerrit <- gerrit_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)

# Pull individual fields out of the serialized string. str_match() yields
# NA where the pattern is absent, so missing fields become NA naturally.
gerrit_df <- gerrit_df |>
  mutate(
    gerrit_status = str_match(selected_gerrit_results, "'status':\\s*'([^']*)',\\s*'reviewer")[, 2],
    gerrit_owner_email = str_match(selected_gerrit_results, "'owner_email':\\s*'([^']*)'")[, 2],
    gerrit_url_in_message = str_match(selected_gerrit_results, "'written_url_in_message':\\s*'([^']*)'")[, 2],
    gerrit_code_insertions = as.integer(str_match(selected_gerrit_results, "'code_insertions':\\s*(\\d+)")[, 2]),
    gerrit_code_deletions = as.integer(str_match(selected_gerrit_results, "'code_deletions':\\s*(\\d+)")[, 2]),
    gerrit_reviewer_count = as.integer(str_match(selected_gerrit_results, "'reviewer_count':\\s*(\\d+)")[, 2])
  )

# One row per unique comment before joining.
# NOTE(review): this dedup key uses TaskPHID where the Phabricator frames
# above used task_title — confirm this is intentional.
gerrit_df_unique <- gerrit_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, .keep_all = TRUE)
# Attach the parsed Gerrit columns to the labeled/master join result,
# then drop any rows the join duplicated.
phab_gerrit_keys <- c("comment_text", "date_created", "task_title", "id",
                      "AuthorPHID", "TaskPHID", "comment_type")
joined_2_df <- left_join(joined_df_unique, gerrit_df_unique,
                         by = phab_gerrit_keys)
joined_2_df_unique <- joined_2_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id,
           .keep_all = TRUE)
# Step 3: load the OLMo sentence-level categorization output and prefix
# its payload columns so their origin stays clear after the join.
# (Fixed: top-level assignment used `=`; standard R style is `<-`.)
olmo_csv <- "~/analysis_data/constituent_dfs/all_092225_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)

# Duplicate diagnostic on the join key used below.
dupes_olmo <- olmo_df |>
  count(task_title, id, comment_text, comment_type) |>
  filter(n > 1)

# rename() is the idiomatic one-step replacement for the previous
# mutate-copy-then-select-drop pattern; values are unchanged (renamed
# columns keep their original positions instead of moving to the end).
olmo_df <- olmo_df |>
  rename(
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_categories = sentence_categories
  )

olmo_df_unique <- olmo_df |>
  distinct(task_title, id, comment_text, comment_type, .keep_all = TRUE)
# Attach the OLMo categorizations, deduplicate, and assign fresh
# sequential ids (the "re-id" step from the TODO above).
joined_3_df <- left_join(
  joined_2_df_unique,
  olmo_df_unique,
  by = c("task_title", "id", "comment_text", "comment_type")
)

# Deduplicate on the five-column key, then overwrite `id` with a simple
# row number; the old id has already served its purpose as a join key.
joined_3_df_unique <- joined_3_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id,
           .keep_all = TRUE) |>
  mutate(id = row_number())
# Step 4: drop affiliation columns and the suffixed duplicate columns left
# over from the joins, then normalize empty strings and empty-list strings
# ("[]") to NA across every column.
joined_3_df_unique <- joined_3_df_unique |>
  select(
    -AuthorWMFAffil,
    -CloserWMFAffil,
    -ends_with(".y"),
    -ends_with(".x")
  ) |>
  mutate(across(
    everything(),
    # base ifelse (not dplyr::if_else) on purpose: columns have mixed
    # types, and if_else's strict type check would error on non-character
    # columns. NA inputs stay NA (NA comparison yields NA -> ifelse NA).
    ~ ifelse(.x == "" | .x == "[]", NA, .x)
  ))

# Sanity counts, wrapped in print() so they show up under source() as well
# as interactive/Rscript runs (previously these lines were silent there).
print(length(unique(joined_3_df_unique$TaskPHID)))
print(length(unique(joined_3_df_unique$id)))

# Written to the current working directory (relative path).
write.csv(joined_3_df_unique, "092925_unified_phab.csv", row.names = FALSE)