From 00a1c5d1574a257b7a01cb8a8f9fca7cad6d86bf Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Thu, 13 Jun 2024 13:40:27 -0500 Subject: [PATCH 1/3] updating EDA around outcome variables --- R/contribRDDAnalysis.R | 5 +++++ R/readmeRDDAnalysis.R | 3 +++ 2 files changed, 8 insertions(+) diff --git a/R/contribRDDAnalysis.R b/R/contribRDDAnalysis.R index 0aa4d1d..a22bfc9 100644 --- a/R/contribRDDAnalysis.R +++ b/R/contribRDDAnalysis.R @@ -43,6 +43,11 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg") #logging all_actions_data$logged_count <- log(all_actions_data$count) all_actions_data$log1p_count <- log1p(all_actions_data$count) +#EDA +range(all_actions_data$log1p_count) # 0.000000 6.745236 +mean(all_actions_data$log1p_count) # 1.200043 +var(all_actions_data$log1p_count) # 1.753764 +median(all_actions_data$log1p_count) # 0.6931472 # now for merge mrg_actions_data$logged_count <- log(mrg_actions_data$count) mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) diff --git a/R/readmeRDDAnalysis.R b/R/readmeRDDAnalysis.R index 3a4d644..baa8db3 100644 --- a/R/readmeRDDAnalysis.R +++ b/R/readmeRDDAnalysis.R @@ -45,6 +45,7 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg") #log the dependent all_actions_data$logged_count <- log(all_actions_data$count) all_actions_data$log1p_count <- log1p(all_actions_data$count) +range(all_actions_data$log1p_count) # 3 rdd in lmer analysis # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc @@ -55,8 +56,10 @@ library(lattice) #some more EDA to go between Poisson and neg binomial var(all_actions_data$log1p_count) # 1.125429 mean (all_actions_data$log1p_count) # 0.6426873 +median(all_actions_data$log1p_count) #0 var(all_actions_data$count) # 268.4449 mean (all_actions_data$count) # 3.757298 +median(all_actions_data$count) # 0 #all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ 
def sample_from_doc(sample_k, doc_directory, min_lines=10):
    """Randomly pick up to ``sample_k`` sufficiently long files per sub-directory.

    For every sub-directory of ``doc_directory`` the sub-directory name is
    printed, files are drawn at random without replacement, and a file is
    kept when it has at least ``min_lines`` lines. The list of kept
    ``[file_name, line_count]`` pairs is printed for each sub-directory.

    Args:
        sample_k: maximum number of files to keep per sub-directory.
        doc_directory: directory whose immediate sub-directories hold the
            candidate documents.
        min_lines: minimum number of lines a file needs to be kept
            (defaults to 10, the threshold that used to be hard-coded).

    Returns:
        dict mapping each sub-directory name to its list of
        ``[file_name, line_count]`` pairs. A list is shorter than
        ``sample_k`` when the sub-directory does not contain enough
        qualifying files.
    """
    sampled_by_subdir = {}
    # Sorted listings make a run reproducible once the random seed is fixed.
    for subdir in sorted(os.listdir(doc_directory)):
        subdir_path = os.path.join(doc_directory, subdir)
        if not os.path.isdir(subdir_path):
            # Stray files next to the partitions cannot be listed; skip them.
            continue
        print(subdir)
        candidates = sorted(
            name
            for name in os.listdir(subdir_path)
            if os.path.isfile(os.path.join(subdir_path, name))
        )
        final_sampled = []
        # A random permutation visits each file at most once, so no file is
        # sampled twice and the loop always ends, even when fewer than
        # sample_k files are long enough (or the directory is empty).
        for trial_sample in sample(candidates, len(candidates)):
            if len(final_sampled) >= sample_k:
                break
            trial_path = os.path.join(subdir_path, trial_sample)
            # errors="replace" keeps one badly encoded document from
            # aborting the whole sampling run; only lines are counted.
            with open(trial_path, "r", encoding="utf-8", errors="replace") as f:
                file_length = sum(1 for _ in f)
            if file_length >= min_lines:
                final_sampled.append([trial_sample, file_length])
        print(final_sampled)
        sampled_by_subdir[subdir] = final_sampled
    return sampled_by_subdir
def how_many_docs(dataset_csv):
    """Count the repositories that ship README / CONTRIBUTING files.

    Reads the ``upstream_vcs_link`` column of ``dataset_csv``, clones every
    linked repository below ``temp_dir``, looks at the names of the
    top-level entries of the resulting checkout, and removes the clone
    again before moving on to the next repository.

    Args:
        dataset_csv: path of a CSV file that has an ``upstream_vcs_link``
            column.

    Returns:
        ``(readme_count, contributing_count)``: the number of repositories
        with at least one top-level entry whose upper-cased name contains
        ``"README"`` and ``"CONTRIBUTING"`` respectively.

    Raises:
        Any error raised by ``Repo.clone_from`` propagates unchanged; the
        partially created clone directory is still removed first.
    """
    df = pd.read_csv(dataset_csv)
    project_repos = df['upstream_vcs_link'].to_list()
    print(len(project_repos))
    readme_count = 0
    contributing_count = 0
    for raw_link in tqdm(project_repos):
        # Strip before parsing so stray whitespace never leaks into the
        # clone URL or into the temporary directory name.
        vcs_link = raw_link.strip()
        parts = vcs_link.split("/")
        if ("github" in vcs_link or "gitlab" in vcs_link) and len(parts) >= 5:
            # Making an evaluation that sub branches aren't being used and
            # that people would fork if needed: anything after the fifth
            # "/"-separated part is dropped, so only the branch checked out
            # by a plain clone is inspected.
            # NOTE(review): assumes links shaped like
            # scheme://host/owner/repo[/...] - confirm against the dataset.
            vcs_link = "/".join(parts[0:5])
            repo_name = parts[4]
        else:
            # Ignore trailing slashes so the directory name is not empty.
            repo_name = vcs_link.rstrip("/").split("/")[-1]
        full_temp_path = temp_dir + repo_name + ".git"
        # A directory left over from an interrupted run would make the
        # clone fail, so clear the destination first.
        shutil.rmtree(full_temp_path, ignore_errors=True)
        try:
            Repo.clone_from(vcs_link, full_temp_path)
            upper_names = [name.upper() for name in os.listdir(full_temp_path)]
        finally:
            # Always remove the clone, including when cloning or listing fails.
            shutil.rmtree(full_temp_path, ignore_errors=True)
        if any("README" in name for name in upper_names):
            readme_count += 1
        if any("CONTRIBUTING" in name for name in upper_names):
            contributing_count += 1
    return readme_count, contributing_count