# Build a unified Phabricator discussion dataset: join Biber+ labels with the
# master discussion data, attach Gerrit change metadata, attach OLMo sentence
# categories, deduplicate, re-id, and write out a single CSV.
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# TODO
# join the label data with the existing data from 0714 master
# download and set up DSL library
# figure out how to use the sentence-level variables
# get the categorical variables encoded as integers, then wrapped as factors
# figure out power at 200, 400, 500, 750, and 1000

# Joining sentences with their labels ----
labeled_csv <- "~/p2/quest/092325_biberplus_complete_labels.csv"
labeled_df <- read.csv(labeled_csv, header = TRUE)

main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Duplicate-row inspection tables (kept in the environment for manual review;
# not used downstream).
dupes_labeled <- labeled_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)
dupes_main <- main_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)

# Deduplicate each side before joining so the left join cannot fan out.
labeled_df_unique <- labeled_df |>
  distinct(comment_text, date_created, task_title, AuthorPHID,
           .keep_all = TRUE)
main_df_unique <- main_df |>
  distinct(comment_text, date_created, task_title, AuthorPHID,
           .keep_all = TRUE)

joined_df <- left_join(
  labeled_df_unique,
  main_df_unique,
  by = c(
    "comment_text", "date_created", "task_title", "id",
    "AuthorPHID", "TaskPHID", "comment_type"
  )
)

dupes_joined <- joined_df |>
  count(id, comment_text, date_created) |>
  filter(n > 1)

# Spot-check a known duplicate id (prints to console when run interactively).
joined_df |> filter(id == "1692131")

joined_df_unique <- joined_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id,
           .keep_all = TRUE)

# TODO
# bring in gerrit data
# bring in OLMO data
# re-id

# Gerrit metadata ----
gerrit_csv <- "~/analysis_data/constituent_dfs/080425_gerrit_filled_df.csv"
gerrit_df <- read.csv(gerrit_csv, header = TRUE)

dupes_gerrit <- gerrit_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)

# selected_gerrit_results holds a Python-dict-style string; extract fields
# with regexes. str_match(...)[, 2] is the first capture group.
gerrit_df <- gerrit_df |>
  mutate(
    gerrit_status = str_match(
      selected_gerrit_results,
      "'status':\\s*'([^']*)',\\s*'reviewer"
    )[, 2],
    gerrit_owner_email = str_match(
      selected_gerrit_results,
      "'owner_email':\\s*'([^']*)'"
    )[, 2],
    gerrit_url_in_message = str_match(
      selected_gerrit_results,
      "'written_url_in_message':\\s*'([^']*)'"
    )[, 2],
    gerrit_code_insertions = as.integer(str_match(
      selected_gerrit_results,
      "'code_insertions':\\s*(\\d+)"
    )[, 2]),
    gerrit_code_deletions = as.integer(str_match(
      selected_gerrit_results,
      "'code_deletions':\\s*(\\d+)"
    )[, 2]),
    gerrit_reviewer_count = as.integer(str_match(
      selected_gerrit_results,
      "'reviewer_count':\\s*(\\d+)"
    )[, 2])
  )

gerrit_df_unique <- gerrit_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID,
           .keep_all = TRUE)

joined_2_df <- left_join(
  joined_df_unique,
  gerrit_df_unique,
  by = c(
    "comment_text", "date_created", "task_title", "id",
    "AuthorPHID", "TaskPHID", "comment_type"
  )
)

joined_2_df_unique <- joined_2_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id,
           .keep_all = TRUE)

# OLMo sentence categories ----
olmo_csv <- "~/analysis_data/constituent_dfs/all_092225_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)

dupes_olmo <- olmo_df |>
  count(task_title, id, comment_text, comment_type) |>
  filter(n > 1)

# Prefix the OLMo columns so their origin is clear after the join.
olmo_df <- olmo_df |>
  mutate(
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_categories = sentence_categories
  ) |>
  select(
    -cleaned_sentences,
    -sentence_categories
  )

olmo_df_unique <- olmo_df |>
  distinct(task_title, id, comment_text, comment_type, .keep_all = TRUE)

joined_3_df <- left_join(
  joined_2_df_unique,
  olmo_df_unique,
  by = c("task_title", "id", "comment_text", "comment_type")
)

joined_3_df_unique <- joined_3_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, id,
           .keep_all = TRUE)

# Re-id: replace the original id with a fresh sequential row number.
joined_3_df_unique <- joined_3_df_unique |>
  mutate(id = row_number())

# Drop affiliation columns and any join-suffixed duplicates, and normalize
# empty strings / empty-list strings to NA across every column.
joined_3_df_unique <- joined_3_df_unique |>
  select(
    -AuthorWMFAffil,
    -CloserWMFAffil,
    -ends_with(".y"),
    -ends_with(".x")
  ) |>
  mutate(across(
    everything(),
    ~ ifelse(.x == "" | .x == "[]", NA, .x)
  ))

# Sanity checks: task count and confirm ids are now unique (print to console).
length(unique(joined_3_df_unique$TaskPHID))
length(unique(joined_3_df_unique$id))

write.csv(joined_3_df_unique, "092925_unified_phab.csv", row.names = FALSE)