log1p the data

This commit is contained in:
mjgaughan 2024-04-24 12:59:07 -05:00
parent 67946e6e10
commit c97c24dd13
2 changed files with 516 additions and 514 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -43,13 +43,15 @@ windowed_data$week_offset <- windowed_data$week - 27
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#find some EDA to identify which types of models might be the best for this #find some EDA to identify which types of models might be the best for this
mean(all_actions_data$count) hist(log(all_actions_data$count))
median(all_actions_data$count) median(all_actions_data$count)
table(all_actions_data$count) table(all_actions_data$count)
var(all_actions_data$count) var(all_actions_data$count)
qqnorm(all_actions_data$count) qqnorm(all_actions_data$count)
y <- qunif(ppoints(length(all_actions_data$count))) y <- qunif(ppoints(length(all_actions_data$count)))
qqplot(all_actions_data$count, y) qqplot(all_actions_data$count, y)
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# 3 rdd in lmer analysis # 3 rdd in lmer analysis
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
@@ -67,8 +69,9 @@ windowed_sample_data <- expanded_sample_data |>
windowed_sample_data$scaled_project_age <- scale(windowed_sample_data$age_of_project) windowed_sample_data$scaled_project_age <- scale(windowed_sample_data$age_of_project)
windowed_sample_data$week_offset <- windowed_sample_data$week - 27 windowed_sample_data$week_offset <- windowed_sample_data$week - 27
all_actions_sample_data <- windowed_sample_data[which(windowed_sample_data$observation_type == "all"),] all_actions_sample_data <- windowed_sample_data[which(windowed_sample_data$observation_type == "all"),]
all_actions_sample_data$log1p_count <- log1p(all_actions_sample_data$count)
#test model #test model
test_model <- lmer(count ~ D * I(week_offset) + scaled_project_age + (D * I(week_offset)|upstream_vcs_link), data=all_actions_sample_data, REML=FALSE) test_model <- lmer(log1p_count ~ D * I(week_offset) + scaled_project_age + (D * I(week_offset)|upstream_vcs_link), data=all_actions_sample_data, REML=FALSE)
summary(test_model) summary(test_model)
#plot results #plot results
p <- ggplot(all_actions_sample_data, aes(x=week_offset, y=count, color=upstream_vcs_link), show.legend = FALSE) + p <- ggplot(all_actions_sample_data, aes(x=week_offset, y=count, color=upstream_vcs_link), show.legend = FALSE) +
@@ -77,7 +80,7 @@ p <- ggplot(all_actions_sample_data, aes(x=week_offset, y=count, color=upstream_
theme_bw() theme_bw()
p p
##end of the model testing and plotting section ##end of the model testing and plotting section
all_model <- lmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE) all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE)
summary(all_model) summary(all_model)
all_residuals <- residuals(all_model) all_residuals <- residuals(all_model)
qqnorm(all_residuals) qqnorm(all_residuals)
@@ -87,7 +90,6 @@ summary(mrg_model)
mrg_residuals <- residuals(mrg_model) mrg_residuals <- residuals(mrg_model)
qqnorm(mrg_residuals) qqnorm(mrg_residuals)
# Performance: # Performance:
library(merTools) library(merTools)
ICC(outcome="count", group="week", data=all_actions_data) ICC(outcome="count", group="week", data=all_actions_data)
#testing for different types of models #testing for different types of models