updating for new bivariate plots
This commit is contained in:
		
							parent
							
								
									2efd961fed
								
							
						
					
					
						commit
						6f2858dd72
					
				@ -15,7 +15,7 @@ library(ggdist)
 | 
			
		||||
 | 
			
		||||
ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) +
 | 
			
		||||
  facet_grid(~source) +
 | 
			
		||||
  geom_dots(side = "both", layout = "hex", stackratio = 0.92) +
 | 
			
		||||
  geom_dots(side=  "both", layout = "hex", stackratio = 0.92) +
 | 
			
		||||
  scale_fill_viridis_d() + 
 | 
			
		||||
  xlim(-130, 15) + 
 | 
			
		||||
  theme_minimal() +
 | 
			
		||||
 | 
			
		||||
@ -7,6 +7,19 @@ library(purrr)
 | 
			
		||||
unified_csv <-"~/analysis_data/102725_unified.csv"
 | 
			
		||||
unified_df <- read.csv(unified_csv, header = TRUE) 
 | 
			
		||||
 | 
			
		||||
unified_df <- unified_df |> mutate(has_update = !is.na(olmo_sentence_labels) &
 | 
			
		||||
                                     grepl("(ACTION ON ISSUE|TASK PROGRESS)", as.character(olmo_sentence_labels),
 | 
			
		||||
                                           ignore.case = TRUE),
 | 
			
		||||
                                   has_update = ifelse(
 | 
			
		||||
                                     has_update, 1, 0
 | 
			
		||||
                                   ))
 | 
			
		||||
 | 
			
		||||
model <- glm(has_update ~ PC1 + PC2 + PC3 + PC4 + modal_verbs,
 | 
			
		||||
             family = binomial(link = "logit"),
 | 
			
		||||
             data = unified_df)
 | 
			
		||||
summary(model)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
unified_df |>
 | 
			
		||||
  ggplot(
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										56
									
								
								dsl/dsl.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								dsl/dsl.R
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,56 @@
 | 
			
		||||
library(tidyverse)
 | 
			
		||||
library(dsl)
 | 
			
		||||
 | 
			
		||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
 | 
			
		||||
dsl_df <- read.csv(dsl_csv, header = TRUE) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Power analysis with power_dsl() for a logit model of dsl_score, evaluated
# at several candidate labeled-sample sizes.
power_model <- power_dsl(
  data = dsl_df,
  model = "logit",
  formula = dsl_score ~ human_SOL_prop_adac +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(source) +
    median_PC3_adac +
    week_index,
  # human-coded column and the automated column passed as its prediction
  predicted_var = "human_SOL_prop_adac",
  prediction = "olmo_SOL_prop_adac",
  sample_prob = "sampling_prob",
  labeled_size = c(100, 200, 300, 600, 1000)
)
summary(power_model)
# Plot the power results for the human_SOL_prop_adac coefficient.
plot(power_model, coef_name = "human_SOL_prop_adac")
 | 
			
		||||
 | 
			
		||||
# Keep only the rows whose source is "c1"; the models below use this subset.
dsl_df <- filter(dsl_df, source == "c1")
 | 
			
		||||
 | 
			
		||||
# Logit model of dsl_score fitted with dsl(), using the human-coded BI
# proportion as the predicted variable and the olmo column as its prediction.
trial_model <- dsl(
  data = dsl_df,
  model = "logit",
  formula = dsl_score ~ human_BI_prop_adac +
    median_gerrit_loc_delta + median_gerrit_reviewers +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC4_adac +
    week_index,
  predicted_var = "human_BI_prop_adac",
  prediction = "olmo_BI_prop_adac",
  sample_prob = "sampling_prob"
)
summary(trial_model)
 | 
			
		||||
 | 
			
		||||
# Linear model fitted with dsl(); here the predicted variable
# (human_BE_prop) is the outcome of the formula itself.
style_model <- dsl(
  data = dsl_df,
  model = "lm",
  formula = human_BE_prop ~
    median_PC1 + median_PC4 +
    as.factor(isAuthorWMF) +
    as.factor(author_closer) +
    median_PC3 +
    week_index,
  predicted_var = "human_BE_prop",
  prediction = "olmo_BE_prop",
  sample_prob = "sampling_prob"
)
summary(style_model)
 | 
			
		||||
							
								
								
									
										45
									
								
								dsl/final_bivariate.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								dsl/final_bivariate.R
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,45 @@
 | 
			
		||||
library(tidyverse)
 | 
			
		||||
#library(dsl)
 | 
			
		||||
library(dplyr)
 | 
			
		||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
 | 
			
		||||
dsl_df <- read.csv(dsl_csv, header = TRUE) 
 | 
			
		||||
 | 
			
		||||
# Per (source, isAuthorWMF) summary of resolution outcomes and time to
# resolution.
#   total_sum                 number of rows with a non-missing outcome
#   count_resolution_outcome  sum of the outcome over those rows
#   success_prop              count_resolution_outcome / total_sum
#   median_ttr_days           median TTR divided by 24
#                             (presumably hours -> days; confirm TTR units)
outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF) |>
  summarise(
    total_sum = sum(!is.na(resolution_outcome)),
    # na.rm = TRUE keeps the numerator consistent with total_sum, which only
    # counts non-missing outcomes; without it a single NA turns the count and
    # the proportion into NA for the whole group.
    count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE),
    success_prop = count_resolution_outcome / total_sum,
    median_ttr_days = median(TTR, na.rm = TRUE) / 24,
    # return an ungrouped table so later verbs do not silently run per source
    .groups = "drop"
  )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
library(ggplot2)
 | 
			
		||||
library(ggdist)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Sign-preserving power transform: raises the magnitude of `x` to the power
# `p` and restores the original sign. Vectorized over `x`.
signed_power <- function(x, p) {
  magnitude <- abs(x)^p
  sign(x) * magnitude
}
 | 
			
		||||
 | 
			
		||||
# Sign-preserving log transform: log1p() of the magnitude of `x`, with the
# original sign restored. Vectorized over `x`; maps 0 to 0.
signed_log <- function(x) {
  magnitude <- log1p(abs(x))
  sign(x) * magnitude
}
 | 
			
		||||
# Add sign-preserving transforms of the median PC3/PC4 columns:
# sp_* use signed_power() with exponent 0.2, sl_* use signed_log().
dsl_df <- mutate(
  dsl_df,
  sp_med_pc3_adac = signed_power(median_PC3_adac, 0.2),
  sp_med_pc4_adac = signed_power(median_PC4_adac, 0.2),
  sl_med_pc4_adac = signed_log(median_PC4_adac),
  sl_med_pc3_adac = signed_log(median_PC3_adac)
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Bivariate plot: log1p(TTR / 24) against the signed-log median PC4 score,
# one panel per source, with colour and shape both mapped to isAuthorWMF.
ggplot(
  data = dsl_df,
  mapping = aes(
    x = sl_med_pc4_adac,
    y = log1p(TTR / 24),
    color = isAuthorWMF,
    shape = isAuthorWMF
  )
) +
  # smoother is added before the points, so the points are drawn on top
  geom_smooth(method = "loess", span = 0.5) +
  geom_point() +
  facet_grid(~source) +
  scale_color_viridis_d() +
  theme_minimal()
 | 
			
		||||
@ -3,8 +3,6 @@ library(tidyverse)
 | 
			
		||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
 | 
			
		||||
dsl_df <- read.csv(dsl_csv, header = TRUE) 
 | 
			
		||||
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
 | 
			
		||||
dsl_df <- dsl_df |>
 | 
			
		||||
  filter(source == "c1")
 | 
			
		||||
 | 
			
		||||
library(survival)
 | 
			
		||||
library(broom)
 | 
			
		||||
@ -12,8 +10,8 @@ dsl_df$ttr_weeks <- dsl_df$TTR / 168
 | 
			
		||||
trial.survival <- Surv(dsl_df$ttr_weeks)
 | 
			
		||||
trial.model <- coxph(trial.survival ~ isAuthorWMF + 
 | 
			
		||||
                       median_PC3_adac + week_index + 
 | 
			
		||||
                       median_gerrit_loc_delta + median_gerrit_reviewers +
 | 
			
		||||
                       olmo_BI_prop_adac, data=dsl_df)
 | 
			
		||||
                       median_gerrit_loc_delta + median_gerrit_reviewers + source +
 | 
			
		||||
                     phase + author_closer, data=dsl_df)
 | 
			
		||||
summary(trial.model)
 | 
			
		||||
# Tidy coefficient table for the Cox model. exponentiate = TRUE requests
# exponentiated estimates; conf.int = TRUE adds confidence bounds.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)
 | 
			
		||||
 | 
			
		||||
@ -26,3 +24,5 @@ ggplot(trial.tab,
 | 
			
		||||
 | 
			
		||||
surv.at.means <- survfit(trial.model)
 | 
			
		||||
plot(surv.at.means, xlab="weeks", ylab="survival probability")
 | 
			
		||||
#https://hbiostat.org/stat/binarysurv
 | 
			
		||||
plot(surv.at.means, xlab = "weeks", ylab = "survival probability", xlim = c(0, 26))
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										17
									
								
								mgaughan-rstudio-server_30651103.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								mgaughan-rstudio-server_30651103.out
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,17 @@
 | 
			
		||||
1. SSH tunnel from your workstation using the following command:
 | 
			
		||||
 | 
			
		||||
   ssh -N -L 8787:n3439:51687 mjilg@klone.hyak.uw.edu
 | 
			
		||||
 | 
			
		||||
   and point your web browser to http://localhost:8787
 | 
			
		||||
 | 
			
		||||
2. log in to RStudio Server using the following credentials:
 | 
			
		||||
 | 
			
		||||
   user: mjilg
 | 
			
		||||
   password: JZoMumQwFbIc7EUcaGbK
 | 
			
		||||
 | 
			
		||||
When done using RStudio Server, terminate the job by:
 | 
			
		||||
 | 
			
		||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 | 
			
		||||
2. Issue the following command on the login node:
 | 
			
		||||
 | 
			
		||||
      scancel -f 30651103
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user