60 lines
2.4 KiB
R
60 lines
2.4 KiB
R
library(tidyverse)
|
|
|
|
# Load the human labels and, at the task level, for each task filer:
#   - get the proportion of Observed Bug Behavior + Expected Behavior labels
#   - get the proportion of Solution Discussion + Solution Usage labels
|
|
# Path to the human-label CSV; its first row is read as column names.
human_csv <- "~/dsl/092225_info_matt_labels.csv"
human_df <- read.csv(file = human_csv, header = TRUE)
|
|
|
|
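# Disabled draft (not run): per-task-author variant of the proportions.
# NOTE(review): the draft filters on `task_authorPHID`, but the column created
# by rename() is `Task_AuthorPHID`; it also reads `label` with mixed-case
# values, whereas the active summary uses `human_label` with upper-case
# values. Reconcile these before re-enabling.
# task_authors <- human_df %>%
#   filter(comment_type == "task_description") %>%
#   select(TaskPHID, AuthorPHID) %>%
#   rename(Task_AuthorPHID = AuthorPHID)
#
# result <- task_authors %>%
#   rowwise() %>%
#   mutate(
#     bug_prop = {
#       rows_by_author <- human_df %>% filter(AuthorPHID == task_authorPHID)
#       mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
#     }
#   ) %>%
#   ungroup()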
|
|
# Per-task share of human labels falling in each label family:
#   human_BE_prop  - observed-bug-behavior or expected-behavior labels
#   human_SOL_prop - solution-discussion or solution-usage labels
# The denominator is every row of `human_df` for that TaskPHID.
human_result <- human_df |>
  group_by(TaskPHID) |>
  summarise(
    human_BE_prop = mean(
      human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
    ),
    human_SOL_prop = mean(
      human_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
    )
  )
|
|
|
|
# Load the existing data frame and, at the task level, for each task filer:
#   - get the proportion of Observed Bug Behavior + Expected Behavior labels
#   - get the proportion of Solution Discussion + Solution Usage labels
|
|
# Path to the unified analysis CSV; its first row is read as column names.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(file = main_csv, header = TRUE)
|
|
|
|
# Task-level metadata taken from the rows whose comment_type is
# "task_description"; only the columns listed below are kept.
closed_relevance_summary <- main_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID, resolution_outcome, priority_score, priority,
    source, phase, week_index, isAuthorWMF
  )
|
|
|
|
# TODO: need to get the TaskAuthor's comments, not just the big picture

# Per-task share of OLMo sentence labels in each label family.
# The pattern consumes both quotes around a label. The earlier look-around
# form "(?<=')[^']+(?=')" left the closing quote unconsumed, so the text
# between two quoted labels (e.g. ", ") was also returned as a "label" and
# inflated the denominator of both proportions.
# NOTE(review): assumes labels are stored as single-quoted tokens in
# `olmo_sentence_categories` - confirm against the data.
olmo_label_props <- main_df |>
  mutate(olmo_label = str_extract_all(olmo_sentence_categories, "'[^']+'")) |>
  unnest(olmo_label) |>
  # Strip the surrounding quotes so labels compare equal to the bare names.
  mutate(olmo_label = str_remove_all(olmo_label, "'")) |>
  group_by(TaskPHID) |>
  summarise(
    olmo_BE_prop = mean(
      olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
    ),
    olmo_SOL_prop = mean(
      olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
    )
  )

# Row-level statistics are computed on `main_df` before any unnesting.
# Computing them after unnest() repeated every `main_df` row once per
# extracted label (and dropped rows with no labels), so `n_comments` counted
# labels instead of "task_subcomment" rows and the medians were weighted by
# label count.
task_activity <- main_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    n_comments = sum(comment_type == "task_subcomment")
  )

# Start from the label proportions so the set of tasks (those with at least
# one extracted label) and the column order match the previous output.
machine_result <- olmo_label_props |>
  left_join(task_activity, by = "TaskPHID") |>
  left_join(closed_relevance_summary, by = "TaskPHID") |>
  # dsl_score is 1 when resolution_outcome equals "TRUE", 0 for any other
  # value, and NA when resolution_outcome is NA (including tasks with no
  # matching row in closed_relevance_summary).
  mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0))
|
|
|
|
# Attach the human-label proportions to the machine summary. Tasks with no
# row in `human_result` keep their row and get NA in the human_* columns.
output_df <- machine_result |>
  left_join(human_result, by = "TaskPHID")

# Export the joined table. Previously `machine_result` was written here, which
# left `output_df` unused and the human_* columns out of the exported file.
write.csv(output_df, "100725_bivariate_data.csv", row.names = FALSE)
|