Update analysis data and DSL scripts with new OLMo labels

2025-12-07 10:15:14 -08:00 · 2025-12-07 10:15:14 -08:00 · 108b8aacd6
commit 108b8aacd6
parent cec9d82d41
9 changed files with 541920 additions and 9 deletions
--- a/120725_logit_dsl.RDS
+++ b/120725_logit_dsl.RDS
--- a/analysis_data/120725_unified.csv
+++ b/analysis_data/120725_unified.csv
--- a/analysis_data/all_120525_olmo_batched_categorized.csv
+++ b/analysis_data/all_120525_olmo_batched_categorized.csv
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@ -31,10 +31,21 @@ desc_info <- main_df %>%
    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
  )

+old_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
+old_df <- read.csv(old_csv, header = TRUE) 
+old_task_status <- old_df |>
+  filter(comment_type == "task_description") |>
+  select(TaskPHID, status)
+
+new_desc_info <- desc_info |>
+  left_join(
+    old_task_status,
+    by= "TaskPHID"
+  )
 #identifying comments in ADAC set
 main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
-  left_join(desc_info, by = "TaskPHID") |>
+  left_join(new_desc_info, by = "TaskPHID") |>
  mutate(
    ADAC = as.integer(
      !is.na(task_desc_author) &
@ -73,7 +84,7 @@ first_join <- main_df|>
    by = "id"
  )

-olmo_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
+olmo_csv <- "~/analysis_data/all_120525_olmo_batched_categorized.csv"
 olmo_df <- read.csv(olmo_csv, header = TRUE) 

 olmo_df <- olmo_df |>
@ -135,6 +146,13 @@ pulling <- unified_df |>
 pulling <- unified_df |>
  filter(id == "23366" | id == "20846" | id == "20847")

-write.csv(unified_df, "110925_unified.csv", row.names = FALSE)
+# [x] get the focal repo (the 'project' field) for gerrit code changes; also expose status.y as task_status
+unified_df <- unified_df |> 
+  mutate(
+    gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+"),
+    task_status = status.y
+  )
+  
+write.csv(unified_df, "120725_unified.csv", row.names = FALSE)


--- a/dsl/120225_logit_dsl.RDS
+++ b/dsl/120225_logit_dsl.RDS
--- a/dsl/120725_DSL_frame.csv
+++ b/dsl/120725_DSL_frame.csv
--- a/dsl/dsl.R
+++ b/dsl/dsl.R
@ -1,7 +1,7 @@
 library(tidyverse)
 library(dsl)

-dsl_csv <-"~/dsl/111725_DSL_frame.csv"
+dsl_csv <-"~/dsl/120725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 

 dsl_df <- dsl_df |>
@ -81,7 +81,7 @@ dev_model <- dsl(
  data=dsl_df
 )
 summary(dev_model)
-saveRDS(dev_model, "120225_logit_dsl.RDS")
+saveRDS(dev_model, "120725_logit_dsl.RDS")

 library(broom)
 library(dplyr)
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -1,6 +1,6 @@
 library(tidyverse)

-unified_csv <-"~/analysis_data/110925_unified.csv"
+unified_csv <-"~/analysis_data/120725_unified.csv"
 unified_df <- read.csv(unified_csv, header = TRUE) 

 # 1. aggregate to the task level 
@ -220,7 +220,8 @@ task_level_variables <- unified_df |>
 descriptions <- unified_df |>
  filter(comment_type == "task_description")|>
  select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF, 
-         source, phase, week_index, author_closer, resolution_outcome, priority )
+         source, phase, week_index, author_closer, resolution_outcome, priority,
+         gerrit_repo, task_status)

 task_level_variables <- task_level_variables |>
  left_join(
@ -290,4 +291,4 @@ ggplot(task_level_variables,
  theme_minimal()

 # 4. save
-write.csv(task_level_variables, "111725_DSL_frame.csv", row.names = FALSE)
+write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@ -1,7 +1,7 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/111725_DSL_frame.csv"
+dsl_csv <-"~/dsl/120725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 

 dsl_df <- dsl_df |>