1
0

updating with new olmo labels

This commit is contained in:
Matthew Gaughan 2025-12-07 10:15:14 -08:00
parent cec9d82d41
commit 108b8aacd6
9 changed files with 541920 additions and 9 deletions

BIN
120725_logit_dsl.RDS Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -31,10 +31,21 @@ desc_info <- main_df %>%
task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
)
# Bring in the `status` column from an earlier master discussion export.
# Only rows whose comment_type is "task_description" are kept, reduced to
# TaskPHID + status, and then attached to desc_info by TaskPHID.
old_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
old_df <- read.csv(old_csv, header = TRUE)
old_task_status <- select(
  filter(old_df, comment_type == "task_description"),
  TaskPHID, status
)
# NOTE(review): a left join repeats desc_info rows when a TaskPHID matches more
# than one row of old_task_status — confirm TaskPHID is unique there.
new_desc_info <- left_join(desc_info, old_task_status, by = "TaskPHID")
#identifying comments in ADAC set
main_df <- main_df |>
mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
left_join(desc_info, by = "TaskPHID") |>
left_join(new_desc_info, by = "TaskPHID") |>
mutate(
ADAC = as.integer(
!is.na(task_desc_author) &
@ -73,7 +84,7 @@ first_join <- main_df|>
by = "id"
)
# Path of the olmo batched/categorized CSV that is read into olmo_df.
# NOTE(review): olmo_csv is assigned twice in a row, so only the second
# (all_120525) path is actually read; the first assignment looks like the
# removed side of a diff hunk — confirm before relying on it.
olmo_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
olmo_csv <- "~/analysis_data/all_120525_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
olmo_df <- olmo_df |>
@ -135,6 +146,13 @@ pulling <- unified_df |>
# Pull three specific rows out of unified_df by id.
pulling <- filter(unified_df, id %in% c("23366", "20846", "20847"))
# Snapshot of unified_df before the gerrit_repo / task_status columns exist.
write.csv(unified_df, "110925_unified.csv", row.names = FALSE)
# [ x ] get the focal repo for gerrit code changes
# gerrit_repo: the first run of non-quote characters that follows the literal
#   text 'project': ' inside selected_gerrit_results (NA when there is none).
# task_status: copy of status.y.
# NOTE(review): status.y is presumably the ".y"-suffixed column left by an
#   earlier join where both tables carried `status` — confirm it is the
#   status taken from the old master discussion CSV.
unified_df <- mutate(
  unified_df,
  gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+"),
  task_status = status.y
)
# Second export, now including the two new columns.
write.csv(unified_df, "120725_unified.csv", row.names = FALSE)

Binary file not shown.

3236
dsl/120725_DSL_frame.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
library(tidyverse)
library(dsl)
# Path of the DSL frame CSV that is read into dsl_df.
# NOTE(review): dsl_csv is assigned twice in a row, so only the second
# (120725) path is actually read; the first assignment looks like the removed
# side of a diff hunk — confirm.
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
@ -81,7 +81,7 @@ dev_model <- dsl(
data=dsl_df
)
# Summarise the fitted dev_model, then serialise it to disk with saveRDS.
# NOTE(review): the model is written under two filenames (120225 and 120725);
# the first call looks like the removed side of a diff hunk — confirm whether
# both files are wanted.
summary(dev_model)
saveRDS(dev_model, "120225_logit_dsl.RDS")
saveRDS(dev_model, "120725_logit_dsl.RDS")
library(broom)
library(dplyr)

View File

@ -1,6 +1,6 @@
library(tidyverse)
# Path of the unified CSV that is read into unified_df.
# NOTE(review): unified_csv is assigned twice in a row, so only the second
# (120725) path is actually read; the first assignment looks like the removed
# side of a diff hunk — confirm.
unified_csv <-"~/analysis_data/110925_unified.csv"
unified_csv <-"~/analysis_data/120725_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
@ -220,7 +220,8 @@ task_level_variables <- unified_df |>
descriptions <- unified_df |>
filter(comment_type == "task_description")|>
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
source, phase, week_index, author_closer, resolution_outcome, priority )
source, phase, week_index, author_closer, resolution_outcome, priority,
gerrit_repo, task_status)
task_level_variables <- task_level_variables |>
left_join(
@ -290,4 +291,4 @@ ggplot(task_level_variables,
theme_minimal()
# 4. save
# Writes task_level_variables to CSV without row names; the relative filenames
# resolve against the current working directory.
# NOTE(review): the same frame is written under two filenames (111725 and
# 120725); the first call looks like the removed side of a diff hunk — confirm
# whether both files are wanted.
write.csv(task_level_variables, "111725_DSL_frame.csv", row.names = FALSE)
write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)

View File

@ -1,7 +1,7 @@
library(tidyverse)
#library(dsl)
library(dplyr)
# Path of the DSL frame CSV that is read into dsl_df.
# NOTE(review): dsl_csv is assigned twice in a row, so only the second
# (120725) path is actually read; the first assignment looks like the removed
# side of a diff hunk — confirm.
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>