updating with new human labels
This commit is contained in:
parent
b198781aa0
commit
90311ca136
1949
102125_human_info_sample.csv
Normal file
1949
102125_human_info_sample.csv
Normal file
File diff suppressed because it is too large
Load Diff
61006
analysis_data/102025_human_labels.csv
Normal file
61006
analysis_data/102025_human_labels.csv
Normal file
File diff suppressed because one or more lines are too long
49
analysis_data/sampling_strat_check.R
Normal file
49
analysis_data/sampling_strat_check.R
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(stringr)
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(purrr)
|
||||||
|
|
||||||
|
# Read the unified dataset, then split it into the rows whose comment_type is
# "task_description" and the rows whose comment_type is anything else.
pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
pre_unified_df <- read.csv(file = pre_unified_csv, header = TRUE)

unified_task_df <- filter(pre_unified_df, comment_type == "task_description")
unified_comment_df <- filter(pre_unified_df, comment_type != "task_description")
|
||||||
|
|
||||||
|
library(scales)
|
||||||
|
|
||||||
|
# Count task-description rows for every (source, phase, isAuthorWMF)
# combination and express each count as a proportion of all counted rows,
# rounded to three decimals. The result is printed, not stored.
mutate(
  count(unified_task_df, source, phase, isAuthorWMF),
  pct = round(n / sum(n), 3)
)
|
||||||
|
|
||||||
|
# Median number of non-task-description rows per AuthorPHID.
median_n <- median(
  count(unified_comment_df, AuthorPHID)[["n"]],
  na.rm = TRUE
)
median_n
|
||||||
|
|
||||||
|
# Median number of rows per TaskPHID across the whole unified dataset
# (task-description rows are included in each task's count).
median_comments <- median(
  count(pre_unified_df, TaskPHID)[["n"]],
  na.rm = TRUE
)
median_comments
|
||||||
|
|
||||||
|
# Read the human-label file.
human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(file = human_csv, header = TRUE)

# Build one row per labelled item:
#   1. turn missing labels into the literal string "NA" and squish whitespace,
#   2. gather every label of an item into the list column `human_labels`,
#   3. attach the matching row of the unified dataset.
# NOTE(review): the join uses only id/TaskPHID/AuthorPHID as keys, so any other
# column present in both tables (e.g. comment_type, source, phase) comes back
# suffixed `.x` / `.y` -- confirm that this is intended.
cleaned_human_df <- human_df |>
  mutate(human_label = str_squish(replace_na(human_label, "NA"))) |>
  group_by(
    id, TaskPHID, AuthorPHID, comment_text, task_title,
    comment_type, priority, source, phase
  ) |>
  summarise(human_labels = list(human_label), .groups = "drop") |>
  left_join(pre_unified_df, by = c("id", "TaskPHID", "AuthorPHID"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Median number of human-labelled comments per AuthorPHID.
#
# `human_comment_df` was used here without being defined anywhere in this
# script, which makes the statement fail with "object not found". It is now
# built explicitly from `human_df`:
#   * `distinct()` keeps one row per labelled item, using the same identifying
#     columns that the label-collapsing step groups by, so items carrying
#     several labels are not counted more than once;
#   * task descriptions are dropped so that only comments are counted.
# NOTE(review): excluding task descriptions mirrors how `unified_comment_df`
# is derived from the unified data -- confirm this matches the intended
# definition of `human_comment_df`.
human_comment_df <- human_df |>
  distinct(
    id, TaskPHID, AuthorPHID, comment_text, task_title,
    comment_type, priority, source, phase
  ) |>
  filter(comment_type != "task_description")

# Note: this reuses the name `median_comments`, replacing the per-TaskPHID
# median computed earlier in the script.
median_comments <- human_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments
|
||||||
@ -1,27 +1,49 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
|
||||||
main_csv <-"~/analysis_data/100325_unified_phab.csv"
|
main_csv <-"~/analysis_data/100625_unified_w_affil.csv"
|
||||||
main_df <- read.csv(main_csv, header = TRUE)
|
main_df <- read.csv(main_csv, header = TRUE)
|
||||||
|
|
||||||
|
human_csv <-"~/analysis_data/102025_human_labels.csv"
|
||||||
|
human_df <- read.csv(human_csv, header = TRUE)
|
||||||
|
selected_task_phids <- unique(human_df$TaskPHID)
|
||||||
|
|
||||||
set.seed(1893) # For reproducibility
|
set.seed(1893) # For reproducibility
|
||||||
|
|
||||||
sampled_df <- main_df %>%
|
# moment 1 of sampling
|
||||||
group_by(source) %>%
|
#sampled_df <- main_df %>%
|
||||||
mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
|
# group_by(source) %>%
|
||||||
ungroup() %>%
|
# mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 40)) %>%
|
||||||
|
# ungroup() %>%
|
||||||
|
# filter(sampled_TaskPHID) %>%
|
||||||
|
# select(-sampled_TaskPHID)
|
||||||
|
|
||||||
|
# moment 2 of sampling
|
||||||
|
sampled_tasks <- main_df |>
|
||||||
|
filter(!(TaskPHID %in% selected_task_phids),
|
||||||
|
source=="c1",
|
||||||
|
phase=="3",
|
||||||
|
isAuthorWMF=="TRUE",
|
||||||
|
comment_type=="task_description")|>
|
||||||
|
mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 10)) %>%
|
||||||
filter(sampled_TaskPHID) %>%
|
filter(sampled_TaskPHID) %>%
|
||||||
select(-sampled_TaskPHID)
|
select(-sampled_TaskPHID)
|
||||||
|
|
||||||
labeling_sampled_df <- sampled_df %>%
|
sampled_task_phids <- unique(sampled_tasks$TaskPHID)
|
||||||
group_by(source) %>%
|
|
||||||
mutate(
|
sampled_df <- main_df |>
|
||||||
verification_sample = if_else(
|
filter(TaskPHID %in% sampled_task_phids)
|
||||||
TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
|
|
||||||
)
|
#labeling_sampled_df <- sampled_df %>%
|
||||||
) %>%
|
# group_by(source) %>%
|
||||||
ungroup()
|
# mutate(
|
||||||
|
# verification_sample = if_else(
|
||||||
|
# TaskPHID %in% sample(unique(TaskPHID), min(8, length(unique(TaskPHID)))), 1L, 0L
|
||||||
|
# )
|
||||||
|
# ) %>%
|
||||||
|
# ungroup()
|
||||||
|
|
||||||
sentence_level_sample <- labeling_sampled_df |>
|
#sentence_level_sample <- labeling_sampled_df |>
|
||||||
|
sentence_level_sample <- sampled_df |>
|
||||||
mutate(
|
mutate(
|
||||||
cleaned_sentences = lapply(
|
cleaned_sentences = lapply(
|
||||||
olmo_cleaned_sentences,
|
olmo_cleaned_sentences,
|
||||||
@ -43,4 +65,4 @@ table(sentence_level_sample$verification_sample)
|
|||||||
(nrow(sentence_level_sample) / 293) * 1.5
|
(nrow(sentence_level_sample) / 293) * 1.5
|
||||||
length(unique(sentence_level_sample$TaskPHID))
|
length(unique(sentence_level_sample$TaskPHID))
|
||||||
|
|
||||||
write.csv(sentence_level_sample, "100625_human_info_sample.csv", row.names = FALSE)
|
write.csv(sentence_level_sample, "102125_human_info_sample.csv", row.names = FALSE)
|
||||||
|
|||||||
@ -1,17 +0,0 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
|
||||||
|
|
||||||
ssh -N -L 8787:n3439:57743 mjilg@klone.hyak.uw.edu
|
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
|
||||||
|
|
||||||
2. log in to RStudio Server using the following credentials:
|
|
||||||
|
|
||||||
user: mjilg
|
|
||||||
password: anx8V7R1X2rfcwUV20H/
|
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
|
||||||
2. Issue the following command on the login node:
|
|
||||||
|
|
||||||
scancel -f 30181212
|
|
||||||
Loading…
Reference in New Issue
Block a user