cleaning some of the analysis files
parent 47ac75bee9
commit 0bdd3ab6fa
@@ -15,25 +15,22 @@ overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributor
 mean(overall_data$mmt)
 hist(overall_data$mmt, probability = TRUE)
 
-#age_vector <- overall_data$age_of_project/365
-#quantile(age_vector)
-overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
-table(overall_data$new.age)
-overall_data$new.age.factor <- as.factor(overall_data$new.age)
+#the basic stuff for the overall data
+overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
+mean(overall_data$mmt)
+hist(overall_data$mmt, probability = TRUE)
+#some new variables around age
+#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+#table(overall_data$new.age)
+#overall_data$new.age.factor <- as.factor(overall_data$new.age)
 overall_data$scaled_age <- scale(overall_data$age_of_project)
 
+#model
 mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
 summary(mmtmodel1)
 qqnorm(residuals(mmtmodel1))
 
-#shows the cross-age downward slopes for all underproduction averages in the face of MMT
-g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
-  geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), se=FALSE) +
-  xlab("MMT") +
-  ylab("Underproduction Factor") +
-  theme_bw() +
-  theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
-g4
 #clean octo data
 octo_data <- filter(octo_data, total_contrib != 0)
 # below this is the analysis for the octo data
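The mmt expression above is a weighted mean over contributor types: collaborators count twice, contributors once, so the value runs from 1 (all contributors) to 2 (all collaborators). A minimal sketch on toy data (illustrative values only, not the real dataset):

    # toy frame reusing the script's column names
    toy <- data.frame(collaborators = c(0, 5, 5), contributors = c(10, 5, 0))
    toy$mmt <- ((toy$collaborators * 2) + toy$contributors) /
      (toy$contributors + toy$collaborators)
    toy$mmt  # 1.0 (all contributors), 1.5 (even split), 2.0 (all collaborators)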
@@ -42,34 +39,30 @@ table(octo_data$new.age)
 octo_data$new.age.factor <- as.factor(octo_data$new.age)
 octo_data$scaled_age <- scale(octo_data$age_of_project)
 
-length(which(octo_data$underproduction_low < 0))
-median(octo_data$underproduction_mean)
 
 octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
 mean(octo_data$mmt)
 hist(octo_data$mmt)
 head(octo_data)
 
+#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
 octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
 octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
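Algebraically, (2*i + (t - i)) / t simplifies to 1 + i/t, so issue_mmt is 1 plus the share of contributors active in issues and is bounded by [1, 2] like the commit-based mmt; the sqrt below only eases its right skew. Quick check on toy counts:

    i <- c(0, 3, 10); tot <- c(10, 10, 10)  # toy issue and total contributor counts
    ((i * 2) + (tot - i)) / tot  # 1.0 1.3 2.0
    1 + i / tot                  # identical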
-g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g2
-g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g1
 #right skewed data, need to transform
 octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
 hist(octo_data$wiki_mmt)
-g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
-g3
-median(octo_data$wiki_mmt)
-qqnorm(octo_data$wiki_mmt)
-#left skewed data, need to transform
-typeof(octo_data$wiki_contrib_count)
-sum(octo_data$total_contrib == 0)
+#getting some of the information in about whether projects have specific files
+readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
+contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
+octo_data <- octo_data |>
+  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
+  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
+overall_data <- overall_data |>
+  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
+  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
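The has_readme / has_contrib indicators added above are plain membership tests: %in% yields a logical that as.numeric() turns into 0/1. A self-contained sketch with a hypothetical roster (column name from the script, values invented):

    library(dplyr)
    readme_did_roster <- data.frame(upstream_vcs_link = c("repo_a", "repo_b"))  # toy roster
    d <- data.frame(upstream_vcs_link = c("repo_a", "repo_c"))
    d <- d |>
      mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link))
    d$has_readme  # 1 0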
 
 
 #below are the models for the octo data, there should be analysis for each one
 
 octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
 summary(octo_mmtmodel1)
 
@@ -81,6 +74,7 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme +
 summary(wiki_mmtmodel1)
 qqnorm(residuals(wiki_mmtmodel1))
 
+#these next three are looking at mmt as an outcome of other factors
 mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
 summary(mmt_outcome_model)
 
@@ -96,19 +90,13 @@ texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits
   custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
   custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
   use.packages=FALSE, table=FALSE, ci.force = TRUE)
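texreg() typesets the three models side by side; ci.force = TRUE swaps the standard errors for 95% confidence intervals even on plain lm fits. The same call shape can be previewed in the console with screenreg() on stand-in models (built-in data, purely illustrative):

    library(texreg)
    m1 <- lm(mpg ~ wt, data = mtcars)
    m2 <- lm(mpg ~ wt + hp, data = mtcars)
    screenreg(list(m1, m2), custom.model.names = c("M1", "M2"), ci.force = TRUE)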
-#find the overlap between projects with octo data and projects with readmes or contributings
-readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
-contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
-octo_data <- octo_data |>
-  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-overall_data <- overall_data |>
-  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
 #below here is the analysis for the readme.md data
 cor.test(octo_data$mmt, octo_data$has_readme)
 cor.test(octo_data$mmt, octo_data$has_contrib)
 cor.test(octo_data$has_readme, octo_data$has_contrib)
 
+#using the groupings and estimates from the ranef coefficients from D as data
 #pulling in the group data for the ranef coefficients
 rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
 contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
@@ -124,7 +112,8 @@ rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
 contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
 grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
 grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
-#calculate in terms of July 6, 2020
+# also looking at how long each project has had a specific governance document
+# calculate in terms of July 6, 2020 (when underproduction metrics were collected)
 library(chron)
 dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
 thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
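The strsplit()/t(as.data.frame(...)) step splits 'YYYY-MM-DD HH:MM:SS' strings into a two-column matrix of date and time parts for chron(). The same parsing on a single illustrative timestamp, with the delta against the July 6, 2020 reference date:

    library(chron)
    ts <- "2019-05-01 12:00:00"  # hypothetical event_date value
    parts <- strsplit(ts, " ")[[1]]
    t1 <- chron(dates = parts[1], times = parts[2], format = c('y-m-d', 'h:m:s'))
    difftime(as.chron("2020-07-06"), t1, units = "days")  # ~431.5 days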
@@ -138,10 +127,7 @@ contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_da
 grouped_contrib <- grouped_contrib |>
   mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
   mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
-#analyses
-cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group)
-cor.test(grouped_contrib$underproduction_mean, grouped_contrib$factored_group)
-#test with linear model
+#test with linear model, there should be an interaction between how long the project has had a document and its grouping, no?
 grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
 summary(grouping_model_rm)
 qqnorm(residuals(grouping_model_rm))
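In R's formula syntax, event_delta*factored_group expands to both main effects plus their interaction, which is exactly the interaction the comment above asks about. Easy to confirm on toy data:

    toy <- data.frame(a = rnorm(4), b = factor(c("x", "x", "y", "y")))
    colnames(model.matrix(~ a * b, data = toy))
    # "(Intercept)" "a" "by" "a:by"  -- main effects plus the interaction term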
@@ -34,25 +34,23 @@ window_num <- 8
 windowed_data <- expanded_data |>
   filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
   mutate(D = ifelse(week > 27, 1, 0))
-#scale the age numbers
+#scale the age numbers and calculate the week offset here
 windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
 windowed_data$week_offset <- windowed_data$week - 27
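The window keeps window_num weeks on either side of week 27, D flags post-cutoff observations, and week_offset recenters time at the cutoff so the model intercept sits at the discontinuity, the standard regression-discontinuity framing. A toy version of the same construction:

    library(dplyr)
    window_num <- 8
    toy <- data.frame(week = 15:39)  # toy week index; cutoff at week 27
    toy <- toy |>
      filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
      mutate(D = ifelse(week > 27, 1, 0),
             week_offset = week - 27)
    range(toy$week_offset)  # -8 8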
-#separate out the cleaning d
+#break out the different type of commit actions
 all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
 mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#EDA?
-hist(log(all_actions_data$count))
+#logging
 all_actions_data$logged_count <- log(all_actions_data$count)
 all_actions_data$log1p_count <- log1p(all_actions_data$count)
 # now for merge
 mrg_actions_data$logged_count <- log(mrg_actions_data$count)
 mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
-#TKTK ---------------------
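Both transforms are kept because log() of a zero count is -Inf, while log1p(x) = log(1 + x) stays finite at zero:

    counts <- c(0, 1, 9)
    log(counts)    # -Inf 0.0000000 2.1972246
    log1p(counts)  # 0.0000000 0.6931472 2.3025851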
 #imports for models
 library(lme4)
 library(optimx)
 library(lattice)
-#models -- TKTK need to be fixed
+#model
 all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + (D * week_offset | upstream_vcs_link),
   control=glmerControl(optimizer="bobyqa",
   optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data)
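all_gmodel is a negative-binomial mixed model with project-level random slopes over the full D * week_offset treatment structure; nAGQ=0 trades some accuracy for a much faster fit. A runnable sketch of the same call on simulated counts (toy panel with invented numbers; a singular-fit message is likely since the simulation has no true random slopes):

    library(lme4)
    set.seed(42)
    # toy panel shaped like all_actions_data
    sim <- expand.grid(upstream_vcs_link = paste0("proj", 1:40), week_offset = -8:8)
    sim$D <- as.numeric(sim$week_offset > 0)
    sim$scaled_project_age <- rnorm(nrow(sim))
    sim$count <- rnbinom(nrow(sim), mu = 3, size = 1.5)
    toy_fit <- glmer.nb(count ~ D * week_offset + scaled_project_age +
                          (D * week_offset | upstream_vcs_link),
                        control = glmerControl(optimizer = "bobyqa",
                                               optCtrl = list(maxfun = 2e5)),
                        nAGQ = 0, data = sim)
    summary(toy_fit)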
@@ -75,13 +73,7 @@ g <- test_glmer_ranef_D |>
   theme_bw()
 g
 write.csv(test_glmer_ranef_D, "051224_contrib_grouped.csv")
-#d_effect_ranef_all <- all_model_ranef[all_model_ranef$term=="D",]
-#d_effect_ranef_all$quartile <- ntile(d_effect_ranef_all$condval, 4)
-#plotting ranefs
-#model residuals
-all_residuals <- residuals(all_model)
-qqnorm(all_residuals)
-# mrg behavior for this
+#NOTE: The merge action model below this has not been used but this is what it would be if it was
 mrg_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset | upstream_vcs_link), data=mrg_actions_data, REML=FALSE, control = lmerControl(
   optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
 summary(mrg_model)
 
@@ -39,40 +39,30 @@ windowed_data <- expanded_data |>
 #scale the age numbers
 windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
 windowed_data$week_offset <- windowed_data$week - 27
-#separate out the cleaning d
+#break out the different types of commit actions that are studied
 all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
 mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#find some EDA to identify which types of models might be the best for this
-hist(log(all_actions_data$count))
-median(all_actions_data$count)
-table(all_actions_data$count)
-var(all_actions_data$count)
-qqnorm(all_actions_data$count)
-y <- qunif(ppoints(length(all_actions_data$count)))
-qqplot(all_actions_data$count, y)
+#log the dependent
 all_actions_data$logged_count <- log(all_actions_data$count)
 all_actions_data$log1p_count <- log1p(all_actions_data$count)
 # 3 rdd in lmer analysis
 # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
 # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
-library(lme4)
 # https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar
+library(lme4)
 library(optimx)
 library(lattice)
+#some more EDA to go between Poisson and neg binomial
 var(all_actions_data$log1p_count) # 1.125429
 mean(all_actions_data$log1p_count) # 0.6426873
 var(all_actions_data$count) # 268.4449
 mean(all_actions_data$count) # 3.757298
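This is the standard overdispersion check: a Poisson outcome has variance roughly equal to its mean, and 268.4449 against a mean of 3.757298 (a ratio of about 71) points firmly at the negative binomial. For reference:

    set.seed(1)
    x <- rpois(1e5, lambda = 3.757298)
    var(x) / mean(x)     # ~1 for a true Poisson
    268.4449 / 3.757298  # ~71.4 for the observed counts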
 
-summary(all_actions_data$week_offset)
 #all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1, control=glmerControl(optimizer="bobyqa",
 #  optCtrl=list(maxfun=1e5)))
 all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")
 summary(all_log1p_gmodel)
-#warnings(all_log1p_gmodel)
 #saveRDS(all_log1p_gmodel, "0510_log1p_nagq_gmodel_backup.rda")
-#yesterdays_model <- readRDS("0510_rm_all.rda")
+#I grouped the ranef D effects on 0512
 all_residuals <- residuals(all_log1p_gmodel)
 qqnorm(all_residuals)
 library(broom.mixed)
@@ -91,52 +81,9 @@ g <- test_glmer_ranef_D |>
 g
 write.csv(test_glmer_ranef_D, "051224_readme_grouped.csv")
 ggsave("0509caterpillar.png", g)
-#below this groups the ranefs
+# NOTE: below is the merge model for the same analysis, but it won't converge
"""
|
|
||||||
has_zero <- function(condval, condsd){
|
|
||||||
bounds <- condsd * 1.96
|
|
||||||
return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2))
|
|
||||||
}
|
|
||||||
df_ranefs <- df_ranefs |>
|
|
||||||
mutate(ranef_grouping = has_zero(condval, condsd)) |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == ),]
|
|
||||||
D_df_ranef <- D_df_ranef |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
hist(D_df_ranef$ranef_grouping)
|
|
||||||
#plot the ranefs
|
|
||||||
library(ggplot2)
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
theme_bw()
|
|
||||||
"""
|
|
||||||
#d_effect_ranef_all <- all_model_ranef$upstream_vcs_link
|
|
||||||
#d_effect_ranef_all$quartile <- ntile(d_effect_ranef_all$condval, 4)
|
|
||||||
#model residuals
|
|
||||||
all_residuals <- residuals(all_model)
|
|
||||||
qqnorm(all_residuals)
|
|
||||||
# mrg behavior for this
|
|
||||||
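The block deleted above classified each project's conditional mode for D by whether its 95% interval clears zero (0 = entirely below, 1 = crosses zero, 2 = entirely above) and ranked projects for the caterpillar plot. A self-contained sketch of that logic on simulated condval/condsd values, standing in for as.data.frame(ranef(model, condVar = TRUE)):

    library(dplyr)
    library(ggplot2)
    has_zero <- function(condval, condsd) {
      bounds <- condsd * 1.96
      ifelse((condval - bounds) < 0, ifelse((condval + bounds) > 0, 1, 0), 2)
    }
    set.seed(7)  # simulated stand-ins for the ranef data frame
    df_ranefs <- data.frame(condval = rnorm(50), condsd = runif(50, 0.2, 0.8))
    df_ranefs <- df_ranefs |>
      mutate(ranef_grouping = has_zero(condval, condsd)) |>
      mutate(rank = rank(condval))
    ggplot(df_ranefs, aes(x = rank, y = condval, col = as.factor(ranef_grouping))) +
      geom_linerange(aes(ymin = condval - (1.96 * condsd),
                         ymax = condval + (1.96 * condsd))) +
      theme_bw()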
 mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
 mrg_model <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + (D * week_offset | upstream_vcs_link),
   control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)), data=mrg_actions_data)
 summary(mrg_model)
 saveRDS(mrg_model, "0510_rm_mrg.rda")
-#identifying the quartiles of effect for D
-mrg_model_ranef <- ranef(mrg_model, condVar=TRUE)
-df_mrg_ranefs <- as.data.frame(mrg_model_ranef)
-dotplot(mrg_model_ranef)
-d_effect_ranef_mrg <- mrg_model_ranef[mrg_model_ranef$term=="D",]
-d_effect_ranef_mrg$quartile <- ntile(d_effect_ranef_mrg$condval, 4)
-#doing similar random effect analysis for this
-df_mrg_ranefs <- df_mrg_ranefs |>
-  mutate(ranef_grouping = has_zero(condval, condsd)) |>
-  mutate(rank = rank(condval))
-D_df_mrg_ranefs <- df_mrg_ranefs[which(df_mrg_ranefs$term == "D"),]
-D_df_mrg_ranefs |>
-  ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
-  geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd)))
-#merge model residuals
-mrg_residuals <- residuals(mrg_model)
-qqnorm(mrg_residuals)
-# Performance:
@@ -162,8 +162,6 @@ if __name__ == "__main__":
     print("Mean wordlength: ", mean(wordlengths))
     print("Median wordlength: ", median(wordlengths))
     lemmatized_corpus = preprocess(listed_corpus)
-    #print(lemmatized_corpus)
-    #prepped_corpus, id2word = text_preparation(lemmatized_corpus)
     '''
     vectorizer = CountVectorizer(analyzer='word',
                                  min_df=2,
@@ -175,12 +173,7 @@ if __name__ == "__main__":
     '''
     vectorizer = joblib.load('readme_vectorizer.jl')
    data_vectorized = vectorizer.transform(lemmatized_corpus)
-    #joblib.dump(vectorizer, 'readme_vectorizer.jl')
-    #print(data_vectorized)
     #lda_model_identification(data_vectorized)
-    #freqs = zip(vectorizer.get_feature_names_out(), data_vectorized.sum(axis=0).tolist()[0])
-    # sort from largest to smallest
-    #print(sorted(freqs, key=lambda x: -x[1])[:25])
     #topic_distributions = best_lda_model(data_vectorized, vectorizer.get_feature_names_out())
     #get_most_prevalent(topic_distributions, file_list)
     prevalent_topics(data_vectorized, file_list)