# Exploratory look at the power-analysis inputs: distribution plots,
# a dichotomous "uses milestones" flag, a few correlations, and 95% confidence
# intervals for the means of the variables of interest.

library(dplyr)

# Load data ----
df <- read.csv("../power_data_111023_mmt.csv")
df1 <- read.csv("../inst_all_packages_full_results.csv")

# Distributions ----
# age is divided by 365 to plot it in years (presumably stored in days).
hist(df$age / 365) # there's a big bump at 9 years, why?
hist(df$contributors) # skewed
# NOTE(review): log(0) is -Inf; hist() is expected to drop non-finite values,
# so rows with a zero count silently vanish from the log-scale plots.
hist(log(df$contributors)) # better
hist(df$collaborators) # skewed
hist(log(df$collaborators)) # better
hist(df$milestones)

# Dichotomous milestone flag ----
# One of many ways to generate a dichotomous variable: TRUE when milestones is
# non-zero; everything else (including NA) falls through to FALSE.
df$uses_milestones <- case_when(
  df$milestones != 0 ~ TRUE,
  .default = FALSE
)
table(df$uses_milestones)

# Correlations (playing around) ----
cor.test(df$contributors, as.numeric(df$uses_milestones))
cor.test(df$collaborators, as.numeric(df$uses_milestones))
# NOTE(review): rows with collaborators == 0 make this ratio Inf or NaN;
# confirm how those rows should be treated before relying on this result.
cor.test(df$contributors / df$collaborators, df$age / 365)

# Confidence intervals for the means ----
t.test(df$age)
# 95 percent confidence interval:
#  2793.638 3066.417
t.test(df$contributors)
# 95 percent confidence interval:
#  27.6519 154.6866
t.test(df$collaborators)
# 95 percent confidence interval:
#  50.01884 96.77090
t.test(df1$up.fac.mean)
# 95 percent confidence interval:
#  -0.1961401 -0.1647757

# MMT measures ----
# mmt weights collaborators double; algebraically it equals
# 1 + collaborators / (contributors + collaborators), so for non-negative
# counts it lies in [1, 2].
df$mmt <- (df$contributors + (2 * df$collaborators)) /
  (df$contributors + df$collaborators)
# old_mmt is the contributor share of all people, so it lies in [0, 1].
df$old_mmt <- df$contributors / (df$contributors + df$collaborators)

# The interval recorded below is above 1, so it can only belong to mmt, not to
# old_mmt (which cannot exceed 1); it is therefore attached to the mmt test.
t.test(df$mmt)
# 95 percent confidence interval:
#  1.610638 1.684438

# Kept for comparison with the earlier definition; its interval was not
# recorded in this script.
t.test(df$old_mmt)