diff --git a/R/GovRiskPower.R b/R/GovRiskPower.R
index 9b94c7d..1299ed8 100644
--- a/R/GovRiskPower.R
+++ b/R/GovRiskPower.R
@@ -15,25 +15,22 @@ overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributor
 mean(overall_data$mmt)
 hist(overall_data$mmt, probability = TRUE)
-#age_vector <- overall_data$age_of_project/365
-#quantile(age_vector)
-overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
-table(overall_data$new.age)
-overall_data$new.age.factor <- as.factor(overall_data$new.age)
+#MMT calculation and basic descriptives for the overall data
+overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
+mean(overall_data$mmt)
+hist(overall_data$mmt, probability = TRUE)
+
+#age-bucket variables (commented out for now)
+#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+#table(overall_data$new.age)
+#overall_data$new.age.factor <- as.factor(overall_data$new.age)
 overall_data$scaled_age <- scale(overall_data$age_of_project)
+#baseline model: underproduction as a function of MMT and scaled age
 mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
 summary(mmtmodel1)
 qqnorm(residuals(mmtmodel1))
-#shows the cross-age downward slopes for all underproduction averages in the face of MMT
-g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
-  geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), se=FALSE) +
-  xlab("MMT") +
-  ylab("Underproduction Factor") +
-  theme_bw() +
-  theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
-g4
 #clean octo data
 octo_data <- filter(octo_data, total_contrib != 0)
 # below this is the analysis for the octo data
@@ -42,34 +39,30 @@ table(octo_data$new.age)
 octo_data$new.age.factor <- as.factor(octo_data$new.age)
 octo_data$scaled_age <- scale(octo_data$age_of_project)
-length(which(octo_data$underproduction_low < 0))
-median(octo_data$underproduction_mean)
-
 octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
 mean(octo_data$mmt)
 hist(octo_data$mmt)
 head(octo_data)
-
+#calculate the MMT-equivalent for issue activity and for wiki contribution activity
 octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
 octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
-g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g2
-g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g1
 #right skewed data, need to transform
 octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
 hist(octo_data$wiki_mmt)
-g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g3
-median(octo_data$wiki_mmt)
-qqnorm(octo_data$wiki_mmt)
-#left skewed data, need to transform
-typeof(octo_data$wiki_contrib_count)
-sum(octo_data$total_contrib == 0)
+
+#flag whether each project has a README or CONTRIBUTING file
+readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
+contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
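+#reviewer note (sketch, not in the original script; assumes upstream_vcs_link
+#uniquely identifies a project): %in% returns a logical, and as.numeric()
+#coerces it to the 0/1 dummy used by the models below, e.g.:
+stopifnot(identical(as.numeric(c("a", "b") %in% "a"), c(1, 0)))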
+octo_data <- octo_data |>
+  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
+  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
+overall_data <- overall_data |>
+  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
+  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
+
 #below are the models for the octo data, there should be analysis for each one
-
 octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
 summary(octo_mmtmodel1)
@@ -81,6 +74,7 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme +
 summary(wiki_mmtmodel1)
 qqnorm(residuals(wiki_mmtmodel1))
+#the next three models treat MMT as an outcome of the other factors
 mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
 summary(mmt_outcome_model)
@@ -96,19 +90,13 @@ texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits
   custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
   custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
   use.packages=FALSE, table=FALSE, ci.force = TRUE)
-#find the overlap between projects with octo data and projects with readmes or contributings
-readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
-contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
-octo_data <- octo_data |>
-  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-overall_data <- overall_data |>
-  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
+
 #below here is the analysis for the readme.md data
 cor.test(octo_data$mmt, octo_data$has_readme)
 cor.test(octo_data$mmt, octo_data$has_contrib)
 cor.test(octo_data$has_readme, octo_data$has_contrib)
+
+#use the groupings and estimates of the ranef coefficients on D as data
 #pulling in the group data for the ranef coefficients
 rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
 contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
@@ -124,7 +112,8 @@ rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
 contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
 grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
 grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
-#calculate in terms of July 6, 2020
+# also looking at how long each project has had a specific governance document
+# calculate in terms of July 6, 2020 (when the underproduction metrics were collected)
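+#reviewer sketch (not part of the original analysis): sanity-check the delta
+#arithmetic with plain difftime() before trusting the chron round-trip below
+stopifnot(as.numeric(difftime(as.POSIXct("2020-07-06", tz = "UTC"),
+                              as.POSIXct("2020-06-06", tz = "UTC"), units = "days")) == 30)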
"days")) -#analyses -cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group) -cor.test(grouped_contrib$underproduction_mean, grouped_contrib$factored_group) -#test with linear model +#test with linear model, there should be an interaction between how long the project has had a document and its grouping, no? grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm) summary(grouping_model_rm) qqnorm(residuals(grouping_model_rm)) diff --git a/R/contribRDDAnalysis.R b/R/contribRDDAnalysis.R index 40de148..0aa4d1d 100644 --- a/R/contribRDDAnalysis.R +++ b/R/contribRDDAnalysis.R @@ -34,25 +34,23 @@ window_num <- 8 windowed_data <- expanded_data |> filter(week >= (27 - window_num) & week <= (27 + window_num)) |> mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers +#scale the age numbers and calculate the week offset here windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) windowed_data$week_offset <- windowed_data$week - 27 -#separate out the cleaning d +#break out the different type of commit actions all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#EDA? -hist(log(all_actions_data$count)) +#logging all_actions_data$logged_count <- log(all_actions_data$count) all_actions_data$log1p_count <- log1p(all_actions_data$count) # now for merge mrg_actions_data$logged_count <- log(mrg_actions_data$count) mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) -#TKTK --------------------- #imports for models library(lme4) library(optimx) library(lattice) -#models -- TKTK need to be fixed +#model all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + (D * week_offset | upstream_vcs_link), control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) @@ -75,13 +73,7 @@ g <- test_glmer_ranef_D |> theme_bw() g write.csv(test_glmer_ranef_D, "051224_contrib_grouped.csv") -#d_effect_ranef_all <- all_model_ranef[all_model_ranef$term=="D",] -#d_effect_ranef_all$quartile <- ntile(d_effect_ranef_all$condval, 4) -#plotting ranefs -#model residuals -all_residuals <- residuals(all_model) -qqnorm(all_residuals) -# mrg behavior for this +#NOTE: The merge action model below this has not been used but this is what it would be if it was mrg_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset | upstream_vcs_link), data=mrg_actions_data, REML=FALSE, control = lmerControl( optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) summary(mrg_model) diff --git a/R/readmeRDDAnalysis.R b/R/readmeRDDAnalysis.R index 7caf60e..3a4d644 100644 --- a/R/readmeRDDAnalysis.R +++ b/R/readmeRDDAnalysis.R @@ -39,40 +39,30 @@ windowed_data <- expanded_data |> #scale the age numbers windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) windowed_data$week_offset <- windowed_data$week - 27 -#separate out the cleaning d +#break out the different types of commit actions that are studied all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#find some EDA to identify which types of models might be the best for this -hist(log(all_actions_data$count)) -median(all_actions_data$count) -table(all_actions_data$count) -var(all_actions_data$count) -qqnorm(all_actions_data$count) -y <- qunif(ppoints(length(all_actions_data$count))) 
 all_actions_data$logged_count <- log(all_actions_data$count)
 all_actions_data$log1p_count <- log1p(all_actions_data$count)
 # 3 rdd in lmer analysis
 # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
 # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
-library(lme4) # https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar
+library(lme4)
 library(optimx)
 library(lattice)
-
+#some more EDA to decide between Poisson and negative binomial
 var(all_actions_data$log1p_count) # 1.125429
 mean (all_actions_data$log1p_count) # 0.6426873
 var(all_actions_data$count) # 268.4449
 mean (all_actions_data$count) # 3.757298
-
-summary(all_actions_data$week_offset)
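+#reviewer sketch (not in the original script): make the overdispersion check
+#explicit -- a Poisson model assumes the variance is close to the mean, and
+#268.4449 / 3.757298 is far from 1, which supports the negative binomial choice
+dispersion_ratio <- var(all_actions_data$count) / mean(all_actions_data$count)
+dispersion_ratio # roughly 71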
 #all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1, control=glmerControl(optimizer="bobyqa",
 #                            optCtrl=list(maxfun=1e5)))
 all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")
 summary(all_log1p_gmodel)
-#warnings(all_log1p_gmodel)
 #saveRDS(all_log1p_gmodel, "0510_log1p_nagq_gmodel_backup.rda")
-#yesterdays_model <- readRDS("0510_rm_all.rda")
+#I grouped the ranef D effects on 0512
 all_residuals <- residuals(all_log1p_gmodel)
 qqnorm(all_residuals)
 library(broom.mixed)
@@ -91,52 +81,9 @@ g <- test_glmer_ranef_D |>
 g
 write.csv(test_glmer_ranef_D, "051224_readme_grouped.csv")
 ggsave("0509caterpillar.png", g)
-#below this groups the ranefs
-"""
-has_zero <- function(condval, condsd){
-  bounds <- condsd * 1.96
-  return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2))
-}
-df_ranefs <- df_ranefs |>
-  mutate(ranef_grouping = has_zero(condval, condsd)) |>
-  mutate(rank = rank(condval))
-D_df_ranef <- df_ranefs[which(df_ranefs$term == ),]
-D_df_ranef <- D_df_ranef |>
-  mutate(rank = rank(condval))
-hist(D_df_ranef$ranef_grouping)
-#plot the ranefs
-library(ggplot2)
-D_df_ranef |>
-  ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
-  geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
-  theme_bw()
-"""
-#d_effect_ranef_all <- all_model_ranef$upstream_vcs_link
-#d_effect_ranef_all$quartile <- ntile(d_effect_ranef_all$condval, 4)
-#model residuals
-all_residuals <- residuals(all_model)
-qqnorm(all_residuals)
-# mrg behavior for this
+# NOTE: below is the merge model for the same analysis, but it won't converge
 mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
 mrg_model <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + (D * week_offset | upstream_vcs_link),
                       control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)), data=mrg_actions_data)
 summary(mrg_model)
-saveRDS(mrg, "0510_rm_mrg.rda")
-#identifying the quartiles of effect for D
-mrg_model_ranef <- ranef(mrg_model, condVar=TRUE)
-df_mrg_ranefs <- as.data.frame(mrg_model_ranef)
-dotplot(mrg_model_ranef)
-d_effect_ranef_mrg <- mrg_model_ranef[mrg_model_ranef$term=="D",]
-d_effect_ranef_mrg$quartile <- ntile(d_effect_ranef_mrg$condval, 4)
-#doing similar random effect analysis for this
-df_mrg_ranefs <- df_mrg_ranefs |>
-  mutate(ranef_grouping = has_zero(condval, condsd)) |>
-  mutate(rank = rank(condval))
-D_df_mrg_ranefs <- df_mrg_ranefs[which(df_mrg_ranefs$term == "D"),]
-D_df_mrg_ranefs |>
-  ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
-  geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd)))
-#merge model residuals
-mrg_residuals <- residuals(mrg_model)
-qqnorm(mrg_residuals)
-# Performance:
\ No newline at end of file
+saveRDS(mrg_model, "0510_rm_mrg.rda")
\ No newline at end of file
diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index abb682f..f893c45 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -162,8 +162,6 @@ if __name__ == "__main__":
     print("Mean wordlength: ", mean(wordlengths))
     print("Median wordlength: ", median(wordlengths))
     lemmatized_corpus = preprocess(listed_corpus)
-    #print(lemmatized_corpus)
-    #prepped_corpus, id2word = text_preparation(lemmatized_corpus)
     '''
     vectorizer = CountVectorizer(analyzer='word',
                                  min_df=2,
@@ -174,13 +172,8 @@ if __name__ == "__main__":
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
     '''
     vectorizer = joblib.load('readme_vectorizer.jl')
-    data_vectorized = vectorizer.transform(lemmatized_corpus)
-    #joblib.dump(vectorizer, 'readme_vectorizer.jl')
-    #print(data_vectorized)
+    data_vectorized = vectorizer.transform(lemmatized_corpus)
     #lda_model_identification(data_vectorized)
-    #freqs = zip(vectorizer.get_feature_names_out(), data_vectorized.sum(axis=0).tolist()[0])
-    # sort from largest to smallest
-    #print(sorted(freqs, key=lambda x: -x[1])[:25])
     #topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())
     #get_most_prevalent(topic_distributions, file_list)
     prevalent_topics(data_vectorized, file_list)
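+    # reviewer sketch, not in the original script: a final sanity check that the
+    # loaded vectorizer's stored vocabulary produced a matching document-term
+    # matrix -- transform() reuses the fitted vocabulary, unlike fit_transform(),
+    # which would rebuild it from the current corpus
+    assert data_vectorized.shape[1] == len(vectorizer.get_feature_names_out())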