library(tidyverse)

# Task-level label proportions ----
# Builds one row per TaskPHID with the share of "bug/expected behavior" and
# "solution" labels, from both the human-labelled sample and the OLMo
# (machine) labels, and writes the combined table to CSV.

# Label groups shared by the human and machine summaries.
be_labels <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
sol_labels <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")

# Human labels ----
# Load the human labels and, at the task level:
#   - get the proportion of Observed bug behavior + Expected behavior
#   - get the proportion of Solution Discussion + Solution Usage
human_csv <- "~/dsl/092225_info_matt_labels.csv"
human_df <- read.csv(human_csv, header = TRUE)

# Earlier per-task-author attempt, kept for reference:
# task_authors <- human_df %>%
#   filter(comment_type == "task_description") %>%
#   select(TaskPHID, AuthorPHID) %>%
#   rename(Task_AuthorPHID = AuthorPHID)
# result <- task_authors %>%
#   rowwise() %>%
#   mutate(
#     bug_prop = {
#       rows_by_author <- human_df %>% filter(AuthorPHID == task_authorPHID)
#       mean(rows_by_author$label %in%
#         c("Observed bug behavior", "Expected behavior"))
#     }
#   ) %>%
#   ungroup()

human_result <- human_df |>
  group_by(TaskPHID) |>
  summarise(
    human_BE_prop = mean(human_label %in% be_labels),
    human_SOL_prop = mean(human_label %in% sol_labels),
    .groups = "drop"
  )

# Machine labels ----
# Load the existing unified data and compute the same two proportions at the
# task level from the OLMo labels.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Task-level metadata, taken from each task's description row.
closed_relevance_summary <- main_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID, resolution_outcome, priority_score, priority,
    source, phase, week_index, isAuthorWMF
  )

# Comment-level statistics. These are computed on the un-expanded rows: taking
# them after unnest() (one row per label) would count labels instead of
# comments and weight the medians by the number of labels on each row.
comment_summary <- main_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    n_comments = sum(comment_type == "task_subcomment"),
    .groups = "drop"
  )

# TODO: need to get the TaskAuthor's comments, not just the big picture
# One row per extracted label. The pattern consumes both quotes of every
# label; a lookaround-only pattern such as (?<=')[^']+(?=') also matches the
# ", " separator between two quoted labels and inflates the denominators.
# NOTE(review): assumes olmo_sentence_categories holds single-quoted labels,
# e.g. "['A', 'B']" -- confirm against the data.
label_summary <- main_df |>
  mutate(
    olmo_label = map(
      str_match_all(olmo_sentence_categories, "'([^']+)'"),
      \(m) m[, 2]
    )
  ) |>
  unnest(olmo_label) |>
  group_by(TaskPHID) |>
  summarise(
    olmo_BE_prop = mean(olmo_label %in% be_labels),
    olmo_SOL_prop = mean(olmo_label %in% sol_labels),
    .groups = "drop"
  )

# Only tasks with at least one extracted label appear in label_summary
# (unnest() drops rows whose label list is empty), so that is the row set of
# machine_result.
machine_result <- label_summary |>
  left_join(comment_summary, by = "TaskPHID") |>
  left_join(closed_relevance_summary, by = "TaskPHID") |>
  mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0))

# Attach the human proportions; tasks without human labels get NA there.
output_df <- machine_result |>
  left_join(human_result, by = "TaskPHID")

# Write the joined table (machine + human columns), not machine_result alone.
write.csv(output_df, "100725_bivariate_data.csv", row.names = FALSE)

# [ ] add sampling probabilities