library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# loading in the first unified set to contain updated affiliation data
main_csv <- "~/analysis_data/121625_constituent_dfs/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)
length(unique(main_df$TaskPHID))

# dedupe a task with a changed title and duplicate replies (no duplicate tasks);
# the duplicates are a result of Phabricator merges
# should just be removing duplicates from the overlap between c2 and c3
first_rows <- main_df |>
  filter(id %in% c(20846, 20847)) |>
  distinct(id, .keep_all = TRUE)

others <- main_df |>
  filter(!(id %in% c(20846, 20847))) |>
  filter(id != 23366)

main_df <- bind_rows(others, first_rows)
length(unique(main_df$id))

# filter out existing olmo and PC columns
main_df <- main_df |>
  select(-starts_with("olmo")) |>
  select(-starts_with("PC"))

# change bzimport affiliation from FALSE to BzImport
main_df <- main_df |>
  mutate(isAuthorWMF = as.character(isAuthorWMF)) |>
  mutate(isAuthorWMF = if_else(
    AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v",
    "BzImport",
    isAuthorWMF
  ))

# pull task-level closure info (description author and close date) from the
# task_description rows
desc_info <- main_df %>%
  filter(comment_type == "task_description") %>%
  group_by(TaskPHID) %>%
  ungroup() %>%
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
  )

# getting old status info, which was only on 071425
#old_csv <- "~/analysis_data/121625_constituent_dfs/071425_master_discussion_data.csv"
#old_df <- read.csv(old_csv, header = TRUE)

# duplicates from the c2/c3 overlap in the 0714 version
#duplicate_rows <- old_task_status[duplicated(old_task_status$TaskPHID) |
#                                    duplicated(old_task_status$TaskPHID, fromLast = TRUE), ]

# all duplicates in old_df (c2/c3 overlap) have the same resolution status
#conflicting_status <- old_task_status %>%
#  group_by(TaskPHID) %>%
#  filter(n() > 1, n_distinct(status) > 1) %>%
#  ungroup()

# as such, squashing down to one row for each
#old_task_status <- old_df |>
#  filter(comment_type == "task_description") |>
#  select(TaskPHID, status) |>
#  distinct(TaskPHID, status)

#new_desc_info <- desc_info |>
#  left_join(
#    old_task_status,
#    by = "TaskPHID"
#  )

# identifying comments in the ADAC set:
# ADAC = comment written by the task description's author before the task was closed
main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
    ),
    before_close = as.integer(
      is.na(task_desc_dateClosed) | created < task_desc_dateClosed
    )
  )

# getting PC values (to be done after the revised pass)
pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
length(unique(pca_df$id))

pca_df <- pca_df |>
  select(starts_with("PC"), id)

first_join <- main_df |>
  left_join(
    pca_df,
    by = "id"
  )
length(unique(first_join$id))

# merge in the OLMo sentence labels, renamed so they don't collide with the
# human-label columns joined below
olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
olmo_df <- olmo_df |>
  mutate(
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_labels = sentence_categories
  ) |>
  select(id, olmo_cleaned_sentences, olmo_sentence_labels)

second_join <- first_join |>
  left_join(
    olmo_df,
    by = "id"
  )

# wrangling human labels
large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
small_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

#TODO
# [ x ] collate the two samples into one
large_human_labels_df <- large_human_labels_df |>
  select(id, cleaned_sentences, human_label)
small_human_labels_df <- small_human_labels_df |>
  select(id, cleaned_sentences, human_label)
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)

# [ x ] aggregate sentence-level rows into comment level
human_labels_reduced <- human_labels_df %>%
  group_by(id) %>%
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )

# [ x ] merge into unified data set
third_join <- second_join |>
  left_join(
    human_labels_reduced,
    by = "id"
  )

# [ x ] clean/drop needless fields
# comments with no human labels come through the join as NULL list elements,
# which as.character() turns into the string "NULL"; convert those to NA
unified_df <- third_join |>
  select(-same_author) |>
  mutate(across(c(human_labels, cleaned_sentences), ~ {
    x <- as.character(.x)
    x_trim <- str_squish(x)
    ifelse(x_trim == "NULL", NA_character_, x)
  }))

# [ x ] verify set
length(unique(unified_df$TaskPHID))
length(unique(unified_df$id))

# spot-check individual comments, including the ids deduplicated above
pulling <- unified_df |>
  filter(id == "24695" | id == "24696")
pulling <- unified_df |>
  filter(id == "23366" | id == "20846" | id == "20847")

# [ x ] get the focal repo for gerrit code changes
unified_df <- unified_df |>
  mutate(
    gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
  )

write.csv(unified_df, "121625_unified.csv", row.names = FALSE)