Merge branch 'master' of https://gitea.communitydata.science/mgaughan/kkex_repo
commit ea8677c3f1
@ -43,6 +43,11 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
#logging
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
#EDA
range(all_actions_data$log1p_count) # 0.000000 6.745236
mean(all_actions_data$log1p_count) # 1.200043
var(all_actions_data$log1p_count) # 1.753764
median(all_actions_data$log1p_count) # 0.6931472
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
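A note on the two transforms kept above: log() maps any zero counts to -Inf, while log1p() keeps them at 0, which is why the model formula further down reads the log1p_count column. A minimal sketch of that check, assuming the all_actions_data frame built in this script:

    sum(all_actions_data$count == 0)                 # number of zero-count observations, if any
    sum(is.infinite(all_actions_data$logged_count))  # log(0) turns each of those into -Inf
    range(all_actions_data$log1p_count)              # log1p leaves them at 0 instead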
@ -45,6 +45,7 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
#log the dependent
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
range(all_actions_data$log1p_count)
# 3 rdd in lmer analysis
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
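The two links above cover the pieces separately; one way they combine is sketched below with lmer() on the log1p outcome. The variables D, week_offset, scaled_project_age, and upstream_vcs_link are taken from the glmer.nb call further down, so this is an illustrative Gaussian variant of the specification, not the fitted model:

    library(lme4)

    # RDD inside a mixed-effects model: D marks the post-event side of the cutoff,
    # week_offset is the running variable, their interaction allows a slope change,
    # and each project gets its own random intercept and slopes.
    rdd_sketch <- lmer(log1p_count ~ D * week_offset + scaled_project_age +
                         (D * week_offset | upstream_vcs_link),
                       data = all_actions_data)
    summary(rdd_sketch)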
@ -55,8 +56,10 @@ library(lattice)
#some more EDA to go between Poisson and neg binomial
var(all_actions_data$log1p_count) # 1.125429
mean(all_actions_data$log1p_count) # 0.6426873
median(all_actions_data$log1p_count) # 0
var(all_actions_data$count) # 268.4449
mean(all_actions_data$count) # 3.757298
median(all_actions_data$count) # 0
#all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1, control=glmerControl(optimizer="bobyqa",
#                             optCtrl=list(maxfun=1e5)))
all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")
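The raw counts are heavily overdispersed (variance 268.4 against a mean of 3.76), which is what points away from Poisson and toward a negative binomial. A minimal single-level check along those lines, assuming the same all_actions_data frame (glm.nb comes from MASS; this is an illustration, not the saved glmer.nb model loaded above):

    library(MASS)  # glm.nb

    # For a Poisson outcome, the variance should be roughly equal to the mean.
    var(all_actions_data$count) / mean(all_actions_data$count)  # ~71 here, far above 1

    # Compare Poisson and negative binomial fits on the raw counts.
    pois_fit <- glm(count ~ D * week_offset, data = all_actions_data, family = poisson)
    nb_fit <- glm.nb(count ~ D * week_offset, data = all_actions_data)
    AIC(pois_fit, nb_fit)  # the negative binomial fit should show the much lower AIC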
@ -2,6 +2,44 @@ import json
import os
import csv
import pandas as pd
from git import Repo
from tqdm import tqdm
import shutil

temp_dir = "/data/users/mgaughan/tmp3/"


def how_many_docs(dataset_csv):
    # count how many upstream repos carry a README and/or CONTRIBUTING file at the top level
    df = pd.read_csv(dataset_csv)
    project_repos = df['upstream_vcs_link'].to_list()
    print(len(project_repos))
    readme_count = 0
    contributing_count = 0
    for i in tqdm(range(len(project_repos))):
        vcs_link = project_repos[i]
        if "github" in vcs_link or "gitlab" in vcs_link:
            #making an evaluation that sub branches aren't being used and that people would fork if needed
            #this only looks at main
            vcs_link = "/".join(vcs_link.split("/")[0:5])
            full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
        else:
            full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
        vcs_link = vcs_link.strip()
        repo = Repo.clone_from(vcs_link, full_temp_path)
        files = os.listdir(full_temp_path)
        has_readme = False
        has_contributing = False
        for file in files:
            if "README" in file.upper():
                has_readme = True
            if "CONTRIBUTING" in file.upper():
                has_contributing = True
        if has_readme:
            readme_count += 1
        if has_contributing:
            contributing_count += 1
        shutil.rmtree(full_temp_path, ignore_errors=True)
    return readme_count, contributing_count


def calc_file_denom(project_name):
@ -33,5 +71,7 @@ def for_all_projects():
if __name__ == "__main__":
    #for_all_projects()
    #print(calc_file_denom("zzz-to-char"))
    readmec, contributingc = how_many_docs("final_data/deb_full_data.csv")
    print("README COUNT: " + str(readmec) + "|| CONTRIBUTING COUNT: " + str(contributingc))
text_analysis/qual_sampling.py (new file, 26 lines)
@ -0,0 +1,26 @@
import csv
import io
import shutil
import os
from random import sample

readme_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme"
contributing_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_contributing"


def sample_from_doc(sample_k, doc_directory):
    subdirs = os.listdir(doc_directory)
    for dir in subdirs:
        print(dir)
        files = os.listdir(doc_directory + "/" + dir)
        final_sampled = []
        # keep drawing (with replacement) until sample_k documents of at least 10 lines are found
        while len(final_sampled) < sample_k:
            trial_sample = sample(files, 1)[0]
            with open(doc_directory + "/" + dir + "/" + trial_sample, "r") as f:
                file_length = len(f.readlines())
            if file_length >= 10:
                final_sampled.append([trial_sample, file_length])
        print(final_sampled)


if __name__ == "__main__":
    sample_from_doc(3, readme_wd)