various R activity

This commit is contained in:
mjgaughan 2024-01-28 16:26:41 -06:00
parent ad719470e6
commit 3c7eacad64
17 changed files with 682 additions and 512 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
R/.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

BIN
R/0119-final-mmt.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

BIN
R/630_0119_final.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

View File

@ -34,8 +34,9 @@ data1$new_milestones <- as.numeric(data1$milestones > 0) + 1
data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age) data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age)
table(data1$formal.score) table(data1$formal.score)
hist(data1$old_mmt, prob=TRUE) #inequality of participation hist(data1$old_mmt, prob=TRUE) #inequality of participation
hist(data1$formal.score) median(data1$contributors)
hist(data1$age/365) median(data1$collaborators)
median(data1$age/365)
data1$new_mmt <- data1$mmt - 1 data1$new_mmt <- data1$mmt - 1
hist(data1$new_mmt, prob=TRUE) hist(data1$new_mmt, prob=TRUE)
@ -67,15 +68,35 @@ cor.test(data1$mmt, data1$up.fac.mean)
cor.test(data1$milestones, data1$up.fac.mean) cor.test(data1$milestones, data1$up.fac.mean)
cor.test(data1$age, data1$up.fac.mean) cor.test(data1$age, data1$up.fac.mean)
data1$new.age.factor <- as.factor(data1$new.age)
#geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange")+
g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
geom_point() + geom_point() +
geom_smooth() + #geom_smooth( method="lm", formula=(y~x), colour = "orange")+
geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange", size=1)+
geom_errorbar(aes(ymin=y-yerr, ymax=y+yerr), width=0.09)+
xlab("MMT") + xlab("MMT") +
ylab("Underproduction Factor") + ylab("Underproduction Factor") +
theme_bw() theme_bw()
g g
g g
colors_legend <- c("a"="#E69F00","b"="#56B4E9", "c"="#D55E00","d"="#CC79A7")
#colors_legend <- c("0-9y"="red","9-12y"="green", "12-15y"="blue","15-16y"="orange")
g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
geom_point() +
geom_abline(aes(intercept=1.65, slope=-1.38, color="a"), size=1.5)+
geom_abline(aes(intercept=1.72, slope=-1.38, color="b"), size=1.5)+
geom_abline(aes(intercept=2.25, slope=-1.38, color="c" ), size=1.5)+
geom_abline(aes(intercept=2.8, slope=-1.38, color="d") , size=1.5)+
labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
theme_bw()+
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g
data2 <- subset(data1, (data1$age / 365) < 14 ) data2 <- subset(data1, (data1$age / 365) < 14 )
hist(floor(data2$age)) hist(floor(data2$age))
g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) + g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) +

BIN
R/data_subset_agegroup.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
R/final-mmt-plot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

143
R/newAnalysis.R Normal file
View File

@ -0,0 +1,143 @@
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
data1 <- read_csv('../kk_final_expanded_data_final.csv',show_col_types = FALSE)
data2 <- read_csv('../kk_final_octo_data_total.csv',show_col_types = FALSE)
data3 <- read_csv('../kk_final_doclist_roster.csv',show_col_types = FALSE)
data4 <-read_csv('../kk_final_rosterslist.csv',show_col_types = FALSE)
#getting data subset metadata
head(data1)
head(data2)
head(data3)
head(data4)
length(which(data2$underproduction_low < 0))
mean(data2$underproduction_mean)
length(which(data1$underproduction_low < 0))
mean(data1$underproduction_mean)
length(which(data3$underproduction_low < 0))
mean(data3$underproduction_mean)
length(which(data4$underproduction_low < 0))
mean(data4$underproduction_mean)
data1$mmt <- (((data1$collaborators * 2)+ data1$contributors) / (data1$contributors + data1$collaborators)) - 1
mean(data1$mmt)
hist(data1$mmt, probability = TRUE)
data1$new.age <- as.numeric(cut(data1$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
table(data1$new.age)
data1$new.age.factor <- as.factor(data1$new.age)
hist(data1$new.age)
age1 <- c(0.39369, 0.239271, 0.2096806, 0.1573584)
d1label <- rep("Overall", length(data1$new.age.factor))
d1per <- data1$new.age
d1per[d1per==1] <- 39.37
d1per[d1per==2] <- 23.93
d1per[d1per==3] <- 20.97
d1per[d1per==4] <- 15.74
d1per.factor<- as.factor(d1per)
data5 <- (d1label)
data2$new.age <- as.numeric(cut(data2$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
table(data2$new.age)
data2$new.age.factor <- as.factor(data2$new.age)
hist(data2$new.age)
age2 <- c(0.5675676, 0.1981982, 0.1681682, 0.06606607)
d2label <- rep("Expanded Contrib.", length(data2$new.age.factor))
d2per <- data2$new.age
d2per[d2per==1] <- 56.76
d2per[d2per==2] <- 19.82
d2per[d2per==3] <- 16.82
d2per[d2per==4] <- 06.61
d2per.factor <- as.factor(d2per)
data3$new.age <- as.numeric(cut(data3$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
table(data3$new.age)
data3$new.age.factor <- as.factor(data3$new.age)
hist(data3$new.age)
age3 <-c(0.2556818, 0.2954545, 0.2405303, 0.2083333)
d3label <- rep("Contrib. Files", length(data3$new.age.factor))
d3per <- data3$new.age
d3per[d3per==1] <- 25.57
d3per[d3per==2] <- 29.55
d3per[d3per==3] <- 24.05
d3per[d3per==4] <- 20.83
d3per.factor <- as.factor(d3per)
data4$new.age <- as.numeric(cut(data4$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
table(data4$new.age)
data4$new.age.factor <- as.factor(data4$new.age)
hist(data4$new.age)
age4 <- c(0.5, 0.125, 0.125, 0.25)
d4label <- rep("Contrib. Rosters", length(data4$new.age.factor))
d4per <- data4$new.age
d4per[d4per==1] <- 57.14
d4per[d4per==2] <- 14.29
d4per[d4per==3] <- 14.29
d4per[d4per==4] <- 28.57
d4per.factor <- as.factor(d4per)
all_per <- c(d1per.factor, d2per.factor, d3per.factor, d4per.factor)
all_persss <- c(d1per, d2per, d3per, d4per)
all_labels <- c(d1label, d2label, d3label, d4label)
all_age_groups <- c(data1$new.age.factor, data2$new.age.factor, data3$new.age.factor, data4$new.age.factor)
d5 <- data.frame(labels = all_labels,
age_groups = all_age_groups,
per = all_per,
persss = all_persss)
d5 <- na.omit(d5)
g <- ggplot(d5, aes(fill=forcats::fct_rev(age_groups), y = 1, x=forcats::fct_rev(labels))) +
geom_bar(position="fill", stat="identity") +
scale_fill_discrete(name = "Project Age Group", labels = c("15-16y", "12-15y", "9-12y", "0-9y"), guide = guide_legend(reverse = TRUE)) +
xlab("Dataset") +
ylab("Age Grouping Percentage") +
theme_bw()+
theme(axis.text.x = element_text(angle = 0), legend.position="top")
g
sdata1$new_milestones <- as.numeric(data1$milestone_count > 0) + 1
data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
mmtmodel1 <- lm(underproduction_mean ~ mmt + as.factor(new.age), data=data1)
summary(mmtmodel1)
agemodel1 <- lm(mmt ~ age_of_project, data=data1)
summary(agemodel1)
fsmodel2 <- lm(underproduction_mean ~ new.formal.score, data=data1)
summary(fsmodel2)
g <- ggplot(data1, aes(x=mmt, y=underproduction_mean)) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
xlab("MMT") +
ylab("Underproduction Factor") +
theme_bw()
g
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
g3 <- ggplot(data1, aes(x=mmt, y=underproduction_mean)) +
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
method='lm', formula= y~x) +
xlab("MMT") +
ylab("Underproduction Factor") +
theme_bw()
g3
cor.test(data1$mmt, data1$new.age)
age_data <- subset(data1, !is.na(new.age))
g2 <- ggplot(age_data, aes(x=factor(new.age), y=mmt))+
geom_boxplot()
g2

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 65 KiB

After

Width:  |  Height:  |  Size: 50 KiB

9
R/plotting_age.R Normal file
View File

@ -0,0 +1,9 @@
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
data1 <- read_csv('../age_percentages.csv',show_col_types = FALSE)
head(data1)

BIN
R/temp-mmt-colors.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

BIN
R/temp-temp.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

5
age_percentages.csv Normal file
View File

@ -0,0 +1,5 @@
setname,age1,age2,age3,age4
data1,0.39369,0.239271,0.2096806,0.1573584
data2,0.5675676,0.1981982,0.1681682,0.06606607
data3,0.2556818,0.2954545,0.2405303,0.2083333
data4,0.5714286,0.1428571,0.1428571,0.2857143
1 setname age1 age2 age3 age4
2 data1 0.39369 0.239271 0.2096806 0.1573584
3 data2 0.5675676 0.1981982 0.1681682 0.06606607
4 data3 0.2556818 0.2954545 0.2405303 0.2083333
5 data4 0.5714286 0.1428571 0.1428571 0.2857143

View File

@ -1,8 +0,0 @@
{\rtf1\ansi\ansicpg1252\cocoartf2708
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
\f0\fs24 \cf0 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}