pulling sample for human_labeling
This commit is contained in:
parent
83bcc15811
commit
a14b08cfd8
59686
dsl/100625_human_info_sample.csv
Normal file
59686
dsl/100625_human_info_sample.csv
Normal file
File diff suppressed because one or more lines are too long
@@ -1,23 +1,45 @@
|
||||
library(tidyverse)

# Draw a per-source sample of tasks for human labeling, flag a subset of the
# sampled tasks for verification, and write one row per extracted sentence.

main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE)

set.seed(1871) # For reproducibility

# Keep every row whose TaskPHID is among up to 35 tasks drawn at random within
# each source. The min() guard stops sample() from erroring when a source has
# fewer than 35 distinct tasks; with 35 or more the draw is unchanged.
# NOTE(review): assumes TaskPHID is a character column; sample() treats a
# single numeric value x as 1:x, so confirm the type if that can occur.
sampled_df <- main_df %>%
  group_by(source) %>%
  mutate(
    sampled_TaskPHID = TaskPHID %in%
      sample(unique(TaskPHID), min(35, length(unique(TaskPHID))))
  ) %>%
  ungroup() %>%
  filter(sampled_TaskPHID) %>%
  select(-sampled_TaskPHID)

# Flag up to 7 of the sampled tasks per source: 1L = flagged, 0L = not flagged.
labeling_sampled_df <- sampled_df %>%
  group_by(source) %>%
  mutate(
    verification_sample = if_else(
      TaskPHID %in% sample(unique(TaskPHID), min(7, length(unique(TaskPHID)))),
      1L,
      0L
    )
  ) %>%
  ungroup()

# Extract the single- or double-quoted items from one olmo_cleaned_sentences
# value, returned as a character vector in their order of appearance.
extract_quoted_sentences <- function(x) {
  m <- str_match_all(x, "'([^']*)'|\"([^\"]*)\"")[[1]]
  # Each match fills exactly one capture group (the other is NA), so a row-wise
  # coalesce picks the filled one and keeps the items in their original order
  # instead of listing all single-quoted items before all double-quoted ones.
  vals <- coalesce(m[, 2], m[, 3])
  vals[!is.na(vals) & vals != ""]
}

# One row per extracted sentence; the bare ", " separator is dropped, as are
# the category, normalized* and *gerrit* columns.
sentence_level_sample <- labeling_sampled_df |>
  mutate(
    cleaned_sentences = lapply(olmo_cleaned_sentences, extract_quoted_sentences)
  ) |>
  unnest(cleaned_sentences) |>
  filter(cleaned_sentences != ", ") |>
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -contains("gerrit")
  )

# Number of sentence rows by verification flag.
table(sentence_level_sample$verification_sample)
# NOTE(review): 293 and 1.5 are unexplained constants; confirm what this
# estimate represents before relying on it.
(nrow(sentence_level_sample) / 293) * 1.5

#write.csv(output_df, "100125_human_info_sample.csv", row.names = FALSE)
write.csv(
  sentence_level_sample,
  "100625_human_info_sample.csv",
  row.names = FALSE
)
|
||||
|
||||
17
mgaughan-rstudio-server_29985545.out
Normal file
17
mgaughan-rstudio-server_29985545.out
Normal file
@@ -0,0 +1,17 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:58519 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: Kyq6AGP0kRtXvXeoXReZ
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 29985545
|
||||
Loading…
Reference in New Issue
Block a user