From 00a1c5d1574a257b7a01cb8a8f9fca7cad6d86bf Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Thu, 13 Jun 2024 13:40:27 -0500
Subject: [PATCH 1/3] updating EDA around outcome variables

---
 R/contribRDDAnalysis.R | 5 +++++
 R/readmeRDDAnalysis.R  | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/R/contribRDDAnalysis.R b/R/contribRDDAnalysis.R
index 0aa4d1d..a22bfc9 100644
--- a/R/contribRDDAnalysis.R
+++ b/R/contribRDDAnalysis.R
@@ -43,6 +43,11 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
 #logging
 all_actions_data$logged_count <- log(all_actions_data$count)
 all_actions_data$log1p_count <- log1p(all_actions_data$count)
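+# EDA: summary statistics of log1p(count); trailing comments presumably record the values from a previous run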
+range(all_actions_data$log1p_count) # 0.000000 6.745236
+mean(all_actions_data$log1p_count) # 1.200043
+var(all_actions_data$log1p_count) # 1.753764
+median(all_actions_data$log1p_count) # 0.6931472
 # now for merge
 mrg_actions_data$logged_count <- log(mrg_actions_data$count)
 mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
diff --git a/R/readmeRDDAnalysis.R b/R/readmeRDDAnalysis.R
index 3a4d644..baa8db3 100644
--- a/R/readmeRDDAnalysis.R
+++ b/R/readmeRDDAnalysis.R
@@ -45,6 +45,7 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg")
 #log the dependent 
 all_actions_data$logged_count <- log(all_actions_data$count)
 all_actions_data$log1p_count <- log1p(all_actions_data$count)
+range(all_actions_data$log1p_count)
 # 3 rdd in lmer analysis
 # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
 # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
@@ -55,8 +56,10 @@ library(lattice)
 #some more EDA to go between Poisson and neg binomial
 var(all_actions_data$log1p_count) # 1.125429
 mean (all_actions_data$log1p_count) # 0.6426873
+median(all_actions_data$log1p_count) #0
 var(all_actions_data$count) # 268.4449
 mean (all_actions_data$count) # 3.757298
+median(all_actions_data$count) # 0
 #all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1,  control=glmerControl(optimizer="bobyqa",
 #                           optCtrl=list(maxfun=1e5)))
 all_log1p_gmodel <- readRDS("final_models/0510_rm_all.rda")

From ef25337e554d6a3f12116b9213a6e8113222a80c Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Sun, 16 Jun 2024 13:40:05 -0500
Subject: [PATCH 2/3] sampling for qual_analysis

---
 text_analysis/qual_sampling.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 text_analysis/qual_sampling.py

diff --git a/text_analysis/qual_sampling.py b/text_analysis/qual_sampling.py
new file mode 100644
index 0000000..bb77747
--- /dev/null
+++ b/text_analysis/qual_sampling.py
@@ -0,0 +1,26 @@
+import csv
+import io 
+import shutil 
+import os 
+from random import sample
+
+readme_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme"  # passed to sample_from_doc as doc_directory in __main__
+contributing_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_contributing"  # NOTE(review): not referenced in the code visible here
+
+
+def sample_from_doc(sample_k, doc_directory):
+    """Print, for each subdirectory of doc_directory, up to sample_k distinct random files with >= 10 lines."""
+    for subdir in os.listdir(doc_directory):
+        print(subdir)
+        files = os.listdir(doc_directory + "/" + subdir)
+        final_sampled = []
+        while files and len(final_sampled) < sample_k:  # also stops once every candidate has been tried
+            trial_sample = files.pop(sample(range(len(files)), 1)[0])  # draw without replacement: no duplicate picks
+            with open(doc_directory + "/" + subdir + "/" + trial_sample,"r") as f:
+                file_length = len(f.readlines())
+                if file_length >= 10:
+                    final_sampled.append([trial_sample, file_length])
+        print(final_sampled)
+
+if __name__ == "__main__":
+    sample_from_doc(3, readme_wd)  # sample 3 files from each subdirectory of readme_wd
\ No newline at end of file

From e2da0d95a9884695a4d4b5d49b025a4eea691f1e Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Sun, 16 Jun 2024 18:35:58 -0500
Subject: [PATCH 3/3] checking files across major ds

---
 redo_denom.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/redo_denom.py b/redo_denom.py
index f368ca4..8af278a 100644
--- a/redo_denom.py
+++ b/redo_denom.py
@@ -2,6 +2,44 @@ import json
 import os
 import csv
 import pandas as pd
+from git import Repo
+from tqdm import tqdm
+import shutil 
+
+temp_dir = "/data/users/mgaughan/tmp3/"  # scratch directory for temporary clones; how_many_docs deletes each clone after inspecting it
+
+def how_many_docs(dataset_csv):
+    """Clone each upstream_vcs_link listed in dataset_csv; return (readme_count, contributing_count)."""
+    df = pd.read_csv(dataset_csv)
+    project_repos = df['upstream_vcs_link'].to_list()
+    print(len(project_repos))
+    readme_count = 0
+    contributing_count = 0
+    for vcs_link in tqdm(project_repos):
+        # strip before splitting so stray whitespace never ends up in the clone path
+        vcs_link = vcs_link.strip()
+        if "github" in vcs_link or "gitlab" in vcs_link:
+            # keep only the first five "/"-separated parts (assumes https://host/owner/repo/...):
+            # sub-branch links are cut back to the repo root, so only the default branch is inspected
+            vcs_link = "/".join(vcs_link.split("/")[0:5])
+            full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
+        else:
+            full_temp_path = temp_dir + vcs_link.split('/')[- 1] + ".git"
+        # clear leftovers from an interrupted run; git refuses to clone into a non-empty directory
+        shutil.rmtree(full_temp_path, ignore_errors=True)
+        try:
+            Repo.clone_from(vcs_link, full_temp_path)
+            # only the top-level entries of the checkout are examined
+            names = [name.upper() for name in os.listdir(full_temp_path)]
+        finally:
+            # always remove the clone, even when cloning or listing raises
+            shutil.rmtree(full_temp_path, ignore_errors=True)
+        if any("README" in name for name in names):
+            readme_count += 1
+        if any("CONTRIBUTING" in name for name in names):
+            contributing_count += 1
+    return readme_count, contributing_count
+
 
 
 def calc_file_denom(project_name):
@@ -33,5 +71,7 @@ def for_all_projects():
 
 
 if __name__ == "__main__":
-    for_all_projects()
-    #print(calc_file_denom("zzz-to-char"))
\ No newline at end of file
+    #for_all_projects()
+    #print(calc_file_denom("zzz-to-char"))
+    readmec, contributingc = how_many_docs("final_data/deb_full_data.csv")
+    print("README COUNT: " + str(readmec) + "|| CONTRIBUTING COUNT: " + str(contributingc))
\ No newline at end of file