From 7bcf7ac4f8a24ec2c5b85b442293ee8c1d0f7d25 Mon Sep 17 00:00:00 2001
From: mjgaughan
Date: Mon, 13 Nov 2023 10:52:40 -0600
Subject: [PATCH] expanding matching for data

---
 R/EDA.R            |  2 +-
 R/calculatePower.R | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/R/EDA.R b/R/EDA.R
index 667dca3..0c018c3 100644
--- a/R/EDA.R
+++ b/R/EDA.R
@@ -36,7 +36,7 @@ t.test(df1$up.fac.mean)
 # -0.1961401 -0.1647757
 df$mmt <- (df$contributors + (2 * df$collaborators)) / (df$contributors + df$collaborators)
 df$old_mmt <- (df$contributors) / (df$contributors + df$collaborators)
-t.test(df$mmt)
+t.test(df$old_mmt)
 # 95 percent confidence interval:
 # 1.610638 1.684438
 #
\ No newline at end of file
diff --git a/R/calculatePower.R b/R/calculatePower.R
index 81dbd8d..29d3487 100644
--- a/R/calculatePower.R
+++ b/R/calculatePower.R
@@ -24,17 +24,20 @@ data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)
 data2 <- read_csv('../inst_all_packages_full_results.csv')
 #d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
 #levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")
-
-data1$up.fac.mean <- as.numeric(data2$up.fac.mean[match(data1$pkg, data2$pkg)])
-data1$milestones <- as.numeric(data1$milestones > 0) + 1
+python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)])
+same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)])
+data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
+data1$milestones <- as.numeric(data1$milestones > 0)
 # (2) - Run the model on the pilot data
 data1$formal.score <- data1$mmt / (data1$milestones/data1$age)
 table(data1$milestones)
-hist(data1$mmt) #inequality of participation
+hist(data1$old_mmt) #inequality of participation
 hist(data1$formal.score)
 hist(data1$age/365)
 kmodel1 <- lm(up.fac.mean ~ mmt, data=data1)
 summary(kmodel1)
+kmodel1 <- lm(up.fac.mean ~ old_mmt, data=data1)
+summary(kmodel1)
 kmodel1 <- lm(up.fac.mean ~ formal.score, data=data1)
 summary(kmodel1)
 hist(data1$formal.score)
@@ -48,7 +51,7 @@ g <- ggplot(data1, aes(x=formal.score, y=up.fac.mean)) +
 geom_smooth()
 g
 
-data2 <- subset(data1, (data1$age / 365) < 9 )
+data2 <- subset(data1, (data1$age / 365) < 14 )
 hist(data2$age)
 g <- ggplot(data2, aes(x=formal.score, y=up.fac.mean)) +
 geom_point() +