library(tidyverse)
# stringr, tidyr, dplyr, and purrr are attached by tidyverse; the explicit
# calls below just make the direct dependencies visible
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# loading in the first unified set, which contains the updated affiliation data
main_csv <- "~/analysis_data/121625_constituent_dfs/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

length(unique(main_df$TaskPHID))

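# Optional structural peek before wrangling: dplyr::glimpse() prints column
# names and types, a quick way to confirm the CSV parsed as expected.
glimpse(main_df)
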
# dedupe: one Task had a changed title and duplicated replies (no duplicate
# tasks); the duplicates are a result of Phabricator merges, so this should
# just remove duplicates from the overlap between c2 and c3
first_rows <- main_df |>
  filter(id %in% c(20846, 20847)) |>
  distinct(id, .keep_all = TRUE)

others <- main_df |>
  filter(!(id %in% c(20846, 20847))) |>
  filter(id != 23366)

main_df <- bind_rows(others, first_rows)
length(unique(main_df$id))

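# A minimal post-dedupe guard (assumes `id` should now uniquely identify
# rows); anyDuplicated() returns 0 when no duplicates remain.
stopifnot(anyDuplicated(main_df$id) == 0)
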
# drop existing olmo and PC columns so the fresh versions joined below
# don't collide
main_df <- main_df |>
  select(-starts_with("olmo")) |>
  select(-starts_with("PC"))

# change bzimport affiliation from FALSE to BzImport
main_df <- main_df |>
  mutate(isAuthorWMF = as.character(isAuthorWMF)) |>
  mutate(isAuthorWMF = if_else(
    AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v",
    "BzImport",
    isAuthorWMF
  ))

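# Quick tally to confirm the recode took: expect a "BzImport" level alongside
# the original TRUE/FALSE values (an assumption about the column's levels).
table(main_df$isAuthorWMF, useNA = "ifany")
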
# get each task's description author and closure timestamp
desc_info <- main_df %>%
  filter(comment_type == "task_description") %>%
  transmute(
    TaskPHID,
    task_desc_author = AuthorPHID,
    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
  )

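# Hedged sanity check: the ADAC join below assumes one task_description row
# per TaskPHID, so surface any task that has more than one.
desc_info %>%
  count(TaskPHID) %>%
  filter(n > 1)
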
# getting old status stuff, which only existed in the 071425 snapshot
# old_csv <- "~/analysis_data/121625_constituent_dfs/071425_master_discussion_data.csv"
# old_df <- read.csv(old_csv, header = TRUE)

# duplicates (from the c2/c3 overlap) in the 0714 version:
# duplicate_rows <- old_task_status[duplicated(old_task_status$TaskPHID) |
#   duplicated(old_task_status$TaskPHID, fromLast = TRUE), ]

# all duplicates in old_df (c2/c3 overlap) have the same resolution status:
# conflicting_status <- old_task_status %>%
#   group_by(TaskPHID) %>%
#   filter(n() > 1, n_distinct(status) > 1) %>%
#   ungroup()

# as such, squashing down to one row for each:
# old_task_status <- old_df |>
#   filter(comment_type == "task_description") |>
#   select(TaskPHID, status) |>
#   distinct(TaskPHID, status)

# new_desc_info <- desc_info |>
#   left_join(
#     old_task_status,
#     by = "TaskPHID"
#   )

# identifying comments in the ADAC set: same author as the task description,
# posted before the task was closed
main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
        AuthorPHID == task_desc_author &
        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
    ),
    before_close = as.integer(
      is.na(task_desc_dateClosed) | created < task_desc_dateClosed
    )
  )

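# Spot check of the new flags' joint distribution (every ADAC comment should
# also be before_close by construction).
with(main_df, table(ADAC, before_close))
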
# getting PC values (to redo after the revised pass)
pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
length(unique(pca_df$id))

pca_df <- pca_df |>
  select(starts_with("PC"), id)

first_join <- main_df |>
  left_join(
    pca_df,
    by = "id"
  )

length(unique(first_join$id))

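# Guard against a fan-out join: if `id` is unique in pca_df (printed above),
# the left join cannot add rows.
stopifnot(nrow(first_join) == nrow(main_df))
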
# attach the OLMo sentence categorizations under olmo_-prefixed names
olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)

olmo_df <- olmo_df |>
  rename(olmo_cleaned_sentences = cleaned_sentences,
         olmo_sentence_labels = sentence_categories) |>
  select(id, olmo_cleaned_sentences, olmo_sentence_labels)

second_join <- first_join |>
  left_join(
    olmo_df,
    by = "id"
  )

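# Coverage check (a sketch, not required downstream): share of comments that
# picked up an OLMo label in the join.
mean(!is.na(second_join$olmo_sentence_labels))
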
# wrangling human labels
large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)

small_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102125_human_info_sample.csv"
small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE)

# TODO
# [ x ] collate the two samples into one
large_human_labels_df <- large_human_labels_df |> select(id, cleaned_sentences, human_label)
small_human_labels_df <- small_human_labels_df |> select(id, cleaned_sentences, human_label)
human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)

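# The two label samples could overlap; count exact duplicate sentence rows
# so any double-counting in the rbind() is visible before aggregating.
sum(duplicated(human_labels_df))
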
# [ x ] aggregate sentence-level rows into comment-level list-columns
human_labels_reduced <- human_labels_df %>%
  group_by(id) %>%
  summarise(
    cleaned_sentences = list(cleaned_sentences),
    human_labels = list(str_squish(human_label)),
    .groups = "drop"
  )

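# Peek at the first aggregated comment to confirm each list-column holds one
# vector of sentences and one vector of labels per id.
human_labels_reduced$cleaned_sentences[[1]]
human_labels_reduced$human_labels[[1]]
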
# [ x ] merge into unified data set
third_join <- second_join |>
  left_join(
    human_labels_reduced,
    by = "id"
  )

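# human_labels_reduced is one row per id by construction, so this left join
# must preserve the row count; assert that rather than eyeballing it.
stopifnot(nrow(third_join) == nrow(second_join))
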
# [ x ] clean/drop needless fields: flatten the joined list-columns to
# character and convert literal "NULL" entries (unmatched rows) to NA
unified_df <- third_join |>
  select(-same_author) |>
  mutate(across(c(human_labels, cleaned_sentences),
                ~ {
                  x <- as.character(.x)
                  x_trim <- str_squish(x)
                  ifelse(x_trim == "NULL", NA_character_, x)
                }))

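# How many comments ended up without labels after the "NULL" -> NA cleanup
colSums(is.na(unified_df[c("human_labels", "cleaned_sentences")]))
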
# [ x ] verify set
length(unique(unified_df$TaskPHID))
length(unique(unified_df$id))

# spot-check two specific comment ids
pulling <- unified_df |>
  filter(id %in% c("24695", "24696"))

# spot-check the rows involved in the dedupe above
pulling <- unified_df |>
  filter(id %in% c("23366", "20846", "20847"))

# [ x ] get the focal repo for gerrit code changes
unified_df <- unified_df |>
  mutate(
    gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
  )

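# Illustration of the lookbehind regex above on a made-up payload (a
# hypothetical string; real selected_gerrit_results values may be shaped
# differently):
str_extract("{'project': 'mediawiki/core', 'branch': 'master'}",
            "(?<='project': ')[^']+")
#> [1] "mediawiki/core"
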
write.csv(unified_df, "121625_unified.csv", row.names = FALSE)