From 5bc4003a991ee2f3eb57e9bde1c131532c8752ff Mon Sep 17 00:00:00 2001 From: mgaughan Date: Thu, 24 Oct 2024 15:44:23 -0400 Subject: [PATCH] new branch for new sampling --- R/popRDDAnalyssis.R | 54 +++++++++++++++---------------------------- sample_good_subset.py | 21 +++++++++++++++++ 2 files changed, 39 insertions(+), 36 deletions(-) create mode 100644 sample_good_subset.py diff --git a/R/popRDDAnalyssis.R b/R/popRDDAnalyssis.R index 6f52d66..2dd3e74 100644 --- a/R/popRDDAnalyssis.R +++ b/R/popRDDAnalyssis.R @@ -3,12 +3,8 @@ library(plyr) library(stringr) try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) #load in data -full_df <- read_csv("../final_data/deb_full_data.csv") contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") -contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") -readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") -# age is calculated against December 11, 2023 #some expansion needs to happens for each project expand_timeseries <- function(project_row) { longer <- project_row |> @@ -32,9 +28,6 @@ expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) expanded_readme_data$logcount <- log(expanded_readme_data$count) expanded_contrib_data$logcount <- log(expanded_contrib_data$count) -#scale age -expanded_readme_data$scaled_age <- scale(expanded_readme_data$age_in_days) -expanded_contrib_data$scaled_age <- scale(expanded_contrib_data$age_in_days) #breaking out the types of population counts collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] @@ -43,39 +36,28 @@ contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_coll #import models library(lme4) library(optimx) -library(MASS) -#readme docs -simple_collab_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) -summary(simple_collab_readme_model) -qqnorm(residuals(simple_collab_readme_model)) -simple_contrib_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) -summary(simple_contrib_readme_model) -qqnorm(residuals(simple_contrib_readme_model)) -# I don't think MLM is the right one -collab_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=collab_pop_readme) +collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_readme) summary(collab_readme_model) -saveRDS(collab_readme_model, "final_models/0624_pop_rm_collab_better.rda") -contrib_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=contrib_pop_readme) +saveRDS(collab_readme_model, "0510_pop_rm_collab.rda") +crm_residuals <- residuals(collab_readme_model) +qqnorm(crm_residuals) +contrib_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_readme) summary(contrib_readme_model) -saveRDS(contrib_readme_model, "final_models/0624_pop_rm_contrib.rda") -#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") -#contributing models are not statistically significant`` -library(texreg) - -texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, - custom.model.names=c( 'collab','contrib.' ), - custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'), - use.packages=FALSE, table=FALSE, ci.force = TRUE) +saveRDS(contrib_readme_model, "0510_pop_rm_contrib.rda") +conrm_residuals <- residuals(contrib_readme_model) +qqnorm(conrm_residuals) +collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib) +summary(collab_contrib_model) +saveRDS(collab_contrib_model, "0510_pop_contrib_collab.rda") +contrib_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_contrib) +summary(contrib_contrib_model) +saveRDS(contrib_contrib_model, "0510_pop_contrib_contrib.rda") library(ggplot2) -contrib_pop_readme |> - ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(after_doc))) + - geom_violin() - expanded_readme_data |> - ggplot(aes(x = after_doc, y = count, col = as.factor(after_doc))) + - geom_violin() + ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) + + geom_point() + geom_jitter() expanded_contrib_data |> - ggplot(aes(x = after_doc, y = count, col = as.factor(after_doc))) + - geom_violin() + ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) + + geom_point() + geom_jitter() diff --git a/sample_good_subset.py b/sample_good_subset.py new file mode 100644 index 0000000..aeb4c3d --- /dev/null +++ b/sample_good_subset.py @@ -0,0 +1,21 @@ +import csv +import os +import pandas as pd + + +def for_readme_files(): + ld_csv_path = "final_data/deb_readme_did.csv" + ta_csv_path = "d_readability_readme.csv" + topic_csv_path = "text_analysis/readme_file_topic_distributions.csv" + # criteria for good readme + # longer than half of a pageview + + + +def for_contributing_files(): + ld_csv_path = "final_data/deb_contrib_did.csv" + ta_csv_path = "d_readability_contrib.csv" + topic_csv_path = "text_analysis/contrib_file_topic_distributions.csv" + # criteria for good contributing + # longer than half of a pageview +