updating with DSL power analysis
This commit is contained in:
parent
b7c2c9fcd6
commit
e61d3b6599
3996
dsl/092225_info_matt_labels.csv
Normal file
3996
dsl/092225_info_matt_labels.csv
Normal file
File diff suppressed because it is too large
Load Diff
3128
dsl/093025_power_dsl.csv
Normal file
3128
dsl/093025_power_dsl.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,59 @@
|
||||
library(tidyverse)
|
||||
|
||||
# Human-annotated labels.
# Goal: for each task filer, at the task level, get
#   - the proportion of Observed bug behavior + Expected behavior labels
#   - the proportion of Solution Discussion + Solution Usage labels
human_csv <- "~/dsl/092225_info_matt_labels.csv"
human_df <- read.csv(file = human_csv, header = TRUE)
|
||||
|
||||
#task_authors <- human_df %>%
|
||||
# filter(comment_type == "task_description") %>%
|
||||
# select(TaskPHID, AuthorPHID) %>%
|
||||
# rename(Task_AuthorPHID = AuthorPHID)
|
||||
|
||||
#result <- task_authors %>%
|
||||
# rowwise() %>%
|
||||
# mutate(
|
||||
# bug_prop = {
|
||||
# rows_by_author <- human_df %>% filter(AuthorPHID == task_authorPHID)
|
||||
# mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
|
||||
# }
|
||||
# ) %>%
|
||||
# ungroup()
|
||||
# One row per TaskPHID with the share of that task's rows in human_df whose
# human_label falls in each label family:
#   human_BE_prop  - OBSERVED BUG BEHAVIOR or EXPECTED BEHAVIOR
#   human_SOL_prop - SOLUTION DISCUSSION or SOLUTION USAGE
# mean() of the logical membership test gives the proportion directly; `%in%`
# never returns NA, so rows with a missing label count as "not in the family".
human_result <- summarise(
  group_by(human_df, TaskPHID),
  human_BE_prop = mean(
    human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
  ),
  human_SOL_prop = mean(
    human_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
  )
)
|
||||
|
||||
# Machine-labelled data. Goal: at the task level, for each task filer, get
#   - the proportion of Observed bug behavior + Expected behavior labels
#   - the proportion of Solution Discussion + Solution Usage labels
main_csv <- "~/analysis_data/092925_unified_phab.csv"
main_df <- read.csv(file = main_csv, header = TRUE)

# Task-level metadata, taken from the rows that hold the task description.
closed_relevance_summary <- main_df |>
  filter(comment_type == "task_description") |>
  select(all_of(c(
    "TaskPHID", "closed_relevance", "priority_score",
    "source", "phase", "week_index"
  )))
|
||||
|
||||
# TODO: need to get the TaskAuthor's comments, not just the big picture

# Comment-level covariates, computed on main_df BEFORE the labels are unnested.
# After unnest() every row is one sentence label, so counting or taking medians
# there would count/weight sentences rather than comments.
# NOTE(review): assumes one row of main_df per comment -- confirm.
comment_stats <- main_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    n_comments = sum(comment_type == "task_subcomment")
  )

# Task-level proportions of OLMo sentence labels, joined with the comment-level
# covariates and the task metadata. Only tasks with at least one extracted
# label are kept (unnest() drops rows whose label list is empty).
machine_result <- main_df |>
  mutate(
    # Match each label WITH its surrounding quotes, then strip the quotes.
    # A lookaround-only pattern such as (?<=')[^']+(?=') does not consume the
    # quotes, so it also "matches" the ", " between two labels and inflates
    # the denominator of the proportions below.
    # NOTE(review): assumes labels are single-quoted items, e.g. "['A', 'B']".
    olmo_label = str_extract_all(olmo_sentence_categories, "'[^']+'") |>
      lapply(str_remove_all, pattern = "'")
  ) |>
  unnest(olmo_label) |>
  group_by(TaskPHID) |>
  summarise(
    olmo_BE_prop = mean(
      olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
    ),
    olmo_SOL_prop = mean(
      olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
    )
  ) |>
  left_join(comment_stats, by = "TaskPHID") |>
  left_join(closed_relevance_summary, by = "TaskPHID") |>
  mutate(
    # read.csv() converts a column holding only True/False (and blanks) to
    # logical; TRUE == "True" is then FALSE and every score would be 0.
    # Normalising through character handles both the logical and string cases;
    # missing closed_relevance stays NA.
    dsl_score = as.numeric(toupper(as.character(closed_relevance)) == "TRUE")
  )
|
||||
|
||||
# Add the human-label proportions to the machine-label table. This is a left
# join, so tasks without a row in human_result keep NA in the human_* columns.
output_df <- left_join(machine_result, human_result, by = "TaskPHID")

# NOTE(review): this path is relative to the working directory, while the
# power-analysis script reads "~/dsl/093025_power_dsl.csv" -- confirm the
# script is run from ~/dsl.
write.csv(output_df, file = "093025_power_dsl.csv", row.names = FALSE)
|
||||
@ -0,0 +1,44 @@
|
||||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("naoki-egami/dsl", dependencies = TRUE)
|
||||
library(dsl)
|
||||
|
||||
# Task-level table produced by the data-preparation script.
power_csv <- "~/dsl/093025_power_dsl.csv"
power_df <- read.csv(power_csv, header = TRUE)

# One subset per value of `source`.
# filter() is namespace-qualified because this script attaches only `dsl`:
# in a fresh session a bare filter() resolves to stats::filter() and fails.
power_c1 <- dplyr::filter(power_df, source == "c1")
power_c2 <- dplyr::filter(power_df, source == "c2")
power_c3 <- dplyr::filter(power_df, source == "c3")
|
||||
|
||||
# DSL power analysis on the c1 subset: logit of dsl_score on the human
# bug/expected-behavior proportion plus task covariates, evaluated at several
# candidate labeled-sample sizes. olmo_BE_prop is supplied as the machine
# prediction of human_BE_prop.
power_model <- power_dsl(
  labeled_size = c(100, 200, 300, 600, 1000),
  model = "logit",
  formula = dsl_score ~ human_BE_prop +
    median_gerrit_delta +
    median_gerrit_reviewers +
    n_comments +
    priority_score +
    week_index,
  predicted_var = "human_BE_prop",
  prediction = "olmo_BE_prop",
  data = power_c1
)

summary(power_model)
plot(power_model, coef_name = "human_BE_prop")
|
||||
|
||||
# Trial DSL fit using the same logit specification as the power analysis.
# NOTE(review): this is fit on the full power_df, whereas the power analysis
# uses power_c1 -- confirm that is intended.
trial_model <- dsl(
  model = "logit",
  formula = dsl_score ~ human_BE_prop +
    median_gerrit_delta +
    median_gerrit_reviewers +
    n_comments +
    priority_score +
    week_index,
  predicted_var = "human_BE_prop",
  prediction = "olmo_BE_prop",
  data = power_df
)

summary(trial_model)
|
||||
17
mgaughan-rstudio-server_29920945.out
Normal file
17
mgaughan-rstudio-server_29920945.out
Normal file
@ -0,0 +1,17 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: [REDACTED: session credential must not be committed; remove this job log from version control and rotate the password]
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 29920945
|
||||
Loading…
Reference in New Issue
Block a user