119 lines
4.1 KiB
R
119 lines
4.1 KiB
R
library(tidyverse)
|
|
library(stringr)
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(purrr)
|
|
# TODO
|
|
# join the label data with the existing data from 0714 master
|
|
# download and set up DSL library
|
|
# figure out how to use the sentence-level variables
|
|
# get the categorical variables encoded as integers, then wrapped as factors
|
|
# figure out power at 200, 400, 500, 750, and 1000
|
|
# Join the labeled comments onto the 0714 master discussion data.
labeled_csv <- "~/p2/quest/092325_biberplus_complete_labels.csv"
labeled_df <- read.csv(labeled_csv, header = TRUE)

main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Diagnostics only: key combinations that appear more than once per input.
# NOTE(review): these checks use a different key set than the distinct()
# calls below -- confirm that is intended.
dupes_labeled <- filter(
  count(labeled_df, date_created, id, comment_text, AuthorPHID, TaskPHID),
  n > 1
)
dupes_main <- filter(
  count(main_df, date_created, id, comment_text, AuthorPHID, TaskPHID),
  n > 1
)

# Keep the first row for every (comment_text, date_created, task_title,
# AuthorPHID) combination, retaining all other columns.
labeled_df_unique <- distinct(
  labeled_df,
  comment_text, date_created, task_title, AuthorPHID,
  .keep_all = TRUE
)
main_df_unique <- distinct(
  main_df,
  comment_text, date_created, task_title, AuthorPHID,
  .keep_all = TRUE
)

# Every labeled row is kept; master columns are attached where the keys match.
joined_df <- labeled_df_unique |>
  left_join(
    main_df_unique,
    by = c(
      "comment_text", "date_created", "task_title", "id",
      "AuthorPHID", "TaskPHID", "comment_type"
    )
  )

# Diagnostics only: rows that share id / text / date after the join.
dupes_joined <- filter(
  count(joined_df, id, comment_text, date_created),
  n > 1
)

# Spot check of a single comment id after the join.
filter(joined_df, id == "1692131")

joined_df_unique <- distinct(
  joined_df,
  comment_text, date_created, TaskPHID, AuthorPHID, id,
  .keep_all = TRUE
)
|
|
|
|
# TODO
|
|
# bring in gerrit data
|
|
# bring in OLMO data
|
|
# re-id
|
|
# Bring in the Gerrit data and attach it to the labeled/master join.
gerrit_csv <- "~/analysis_data/constituent_dfs/080425_gerrit_filled_df.csv"
gerrit_df <- read.csv(gerrit_csv, header = TRUE)

# Diagnostics only: key combinations that appear more than once.
dupes_gerrit <- gerrit_df |>
  count(date_created, id, comment_text, AuthorPHID, TaskPHID) |>
  filter(n > 1)

# Return the first capture group of `pattern` for each element of `text`;
# elements where the pattern does not match yield NA.
extract_gerrit_field <- function(text, pattern) {
  str_match(text, pattern)[, 2]
}

# Pull individual fields out of the `selected_gerrit_results` string column.
gerrit_df <- gerrit_df |>
  mutate(
    gerrit_status = extract_gerrit_field(
      selected_gerrit_results,
      "'status':\\s*'([^']*)',\\s*'reviewer"
    ),
    gerrit_owner_email = extract_gerrit_field(
      selected_gerrit_results,
      "'owner_email':\\s*'([^']*)'"
    ),
    gerrit_url_in_message = extract_gerrit_field(
      selected_gerrit_results,
      "'written_url_in_message':\\s*'([^']*)'"
    ),
    gerrit_code_insertions = as.integer(extract_gerrit_field(
      selected_gerrit_results,
      "'code_insertions':\\s*(\\d+)"
    )),
    gerrit_code_deletions = as.integer(extract_gerrit_field(
      selected_gerrit_results,
      "'code_deletions':\\s*(\\d+)"
    )),
    gerrit_reviewer_count = as.integer(extract_gerrit_field(
      selected_gerrit_results,
      "'reviewer_count':\\s*(\\d+)"
    ))
  )

# Keep the first row per (comment_text, date_created, TaskPHID, AuthorPHID).
gerrit_df_unique <- gerrit_df |>
  distinct(comment_text, date_created, TaskPHID, AuthorPHID, .keep_all = TRUE)

joined_2_df <- left_join(
  joined_df_unique,
  gerrit_df_unique,
  by = c(
    "comment_text", "date_created", "task_title", "id",
    "AuthorPHID", "TaskPHID", "comment_type"
  )
)

joined_2_df_unique <- joined_2_df |>
  distinct(
    comment_text, date_created, TaskPHID, AuthorPHID, id,
    .keep_all = TRUE
  )
|
|
|
|
# Bring in the OLMo sentence categorizations and attach them to the join.
olmo_csv <- "~/analysis_data/constituent_dfs/all_092225_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)

# Diagnostics only: key combinations that appear more than once.
dupes_olmo <- olmo_df |>
  count(task_title, id, comment_text, comment_type) |>
  filter(n > 1)

# Copy the two sentence columns under `olmo_`-prefixed names.
# `.keep = "unused"` drops the source columns that were read to build the new
# ones and leaves every other column in place, with the new columns last.
olmo_df <- olmo_df |>
  mutate(
    olmo_cleaned_sentences = cleaned_sentences,
    olmo_sentence_categories = sentence_categories,
    .keep = "unused"
  )

# Keep the first row per join key so the join below cannot multiply rows.
olmo_df_unique <- olmo_df |>
  distinct(task_title, id, comment_text, comment_type, .keep_all = TRUE)

joined_3_df <- left_join(
  joined_2_df_unique,
  olmo_df_unique,
  by = c("task_title", "id", "comment_text", "comment_type")
)

joined_3_df_unique <- joined_3_df |>
  distinct(
    comment_text, date_created, TaskPHID, AuthorPHID, id,
    .keep_all = TRUE
  )
|
|
|
|
# Final clean-up of the unified table, then write it to disk.
joined_3_df_unique <- joined_3_df_unique |>
  # Re-id: overwrite `id` with a sequential row number.
  mutate(id = row_number()) |>
  # NOTE(review): dropping both ".x" and ".y" columns discards every copy of
  # any non-key column shared by two joined tables -- confirm nothing needed
  # is lost here.
  select(
    -AuthorWMFAffil,
    -CloserWMFAffil,
    -ends_with(".y"),
    -ends_with(".x")
  ) |>
  # Turn empty strings and "[]" placeholders into NA. Only character columns
  # can hold those values, and if_else() keeps the column type stable
  # (ifelse() would return a logical vector for an all-blank column).
  mutate(across(
    where(is.character),
    \(col) if_else(col %in% c("", "[]"), NA_character_, col)
  ))

# Summary counts: distinct tasks and distinct (re-assigned) ids.
n_distinct(joined_3_df_unique$TaskPHID)
n_distinct(joined_3_df_unique$id)

write.csv(joined_3_df_unique, "092925_unified_phab.csv", row.names = FALSE)
|