New file to grab time-specific gov docs.
This commit is contained in:
parent
65d970bbd3
commit
eaa84d33e0
66
get_spec_file.py
Normal file
66
get_spec_file.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import csv
|
||||||
|
from git import Repo
|
||||||
|
import os
|
||||||
|
import datetime as dt
|
||||||
|
import time
|
||||||
|
import shutil
|
||||||
|
import pandas as pd
|
||||||
|
import dateutil
|
||||||
|
from tqdm import tqdm
|
||||||
|
import math
|
||||||
|
import io
|
||||||
|
|
||||||
|
# Output root for extracted READMEs.
# NOTE(review): working_dir appears unused in this file — confirm against
# other modules before removing.
working_dir = "/data/users/mgaughan/kkex/time_specific_files_readme"

# Scratch space for temporary clones; each clone is removed after use.
temp_dir = "/data/users/mgaughan/tmp3/"
|
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """Clone a repository and write out its README (or CONTRIBUTING) file
    as of ``commit_hash``.

    The file is read from the commit's tree and written to the
    ``time_specific_files/<readme|contributing>/`` output directory, named
    after the repository. The temporary clone is always removed.

    :param vcs_link: upstream VCS URL (GitHub/GitLab URLs are trimmed to
        the ``host/owner/repo`` prefix — sub-branch paths are ignored on
        the assumption that forks, not branches, are used)
    :param commit_hash: commit whose version of the file is wanted
    :param is_readme: True -> README*, False -> CONTRIBUTING*
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # making an evaluation that sub branches aren't being used and that
        # people would fork if needed — this only looks at main
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = temp_dir + vcs_link.split('/')[4] + ".git"
    else:
        full_temp_path = temp_dir + vcs_link.split('/')[-1] + ".git"
    repo = Repo.clone_from(vcs_link, full_temp_path)
    try:
        commit = repo.commit(commit_hash)
        # Find the actual filename in the clone's root directory, falling
        # back to the canonical name. BUGFIX: the original reset the
        # default on every iteration (so only the last directory entry
        # could match) and raised NameError on an empty directory.
        target_filename = "README.md" if is_readme else "CONTRIBUTING.md"
        marker = "README" if is_readme else "CONTRIBUTING"
        for filename in os.listdir(full_temp_path):
            if marker in filename:
                target_filename = filename
                break
        targetfile = commit.tree / target_filename
        last_path = "readme" if is_readme else "contributing"
        out_path = ("/data/users/mgaughan/kkex/time_specific_files/" + last_path
                    + "/" + full_temp_path[len(temp_dir):-4] + "_" + last_path + ".md")
        with open(out_path, "w") as file:
            with io.BytesIO(targetfile.data_stream.read()) as f:
                file.write(f.read().decode('utf-8'))
        # (no explicit close needed — the with-block handles it)
    finally:
        # BUGFIX: always drop the temporary clone, even when the commit or
        # file lookup raises; previously a failure leaked the clone.
        shutil.rmtree(full_temp_path, ignore_errors=True)
|
def for_all_files():
    """Walk the PR-diff CSV and fetch a time-specific file per row.

    NOTE(review): the per-row call below is hard-coded to a single tqdm
    commit — presumably debug scaffolding; the commented call shows the
    intended form once the right column index is known.
    """
    # toggle this based on readme or contributing files
    readme_is = True
    csv_path = "kk_test_031324_pr_did.csv"
    with open(csv_path, 'r') as file:
        # materialize the lines so tqdm can report a total
        rows = file.readlines()
        for row in tqdm(csv.reader(rows), total=len(rows)):
            if not row[0]:
                continue
            # get_file(row[0], row[?], readme_is)
            get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
||||||
|
if __name__ == "__main__":
    for_all_files()
    # one-off invocations kept for reference:
    # get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
    # get_file('https://github.com/krahets/hello-algo/tree/dev1', 'f615ad42ef3c58cfc6f080b8fb0cd0eb741706a9', True )
|
@ -12,7 +12,7 @@ import math
|
|||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
|
|
||||||
early_cutoff = dt.datetime(2008,2, 8)
|
early_cutoff = dt.datetime(2008,2, 8)
|
||||||
temp_dir = "/data/users/mgaughan/tmp"
|
temp_dir = "/data/users/mgaughan/tmp/"
|
||||||
|
|
||||||
'''
|
'''
|
||||||
- rate of change, rate of all/day
|
- rate of change, rate of all/day
|
||||||
@ -24,8 +24,6 @@ def file_get_pr(upstream_vcs_link, me_read):
|
|||||||
#print(upstream_vcs_link.split('/')[4])
|
#print(upstream_vcs_link.split('/')[4])
|
||||||
project_dict = {}
|
project_dict = {}
|
||||||
project_dict['upstream_vcs_link'] = upstream_vcs_link
|
project_dict['upstream_vcs_link'] = upstream_vcs_link
|
||||||
if upstream_vcs_link == " https://gitlab.com/ubports/core/cmake-extras":
|
|
||||||
return {}
|
|
||||||
upstream_vcs_link = upstream_vcs_link.strip()
|
upstream_vcs_link = upstream_vcs_link.strip()
|
||||||
if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
|
if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
|
||||||
#making an evaluation that sub branches aren't being used and that people would fork if needed
|
#making an evaluation that sub branches aren't being used and that people would fork if needed
|
||||||
@ -36,6 +34,9 @@ def file_get_pr(upstream_vcs_link, me_read):
|
|||||||
else:
|
else:
|
||||||
full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git"
|
full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git"
|
||||||
print(upstream_vcs_link)
|
print(upstream_vcs_link)
|
||||||
|
if upstream_vcs_link == "https://gitlab.com/ubports/core":
|
||||||
|
shutil.rmtree(full_temp_path, ignore_errors=True)
|
||||||
|
return {}
|
||||||
repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
|
repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
|
||||||
try:
|
try:
|
||||||
commits = repo.fetch()
|
commits = repo.fetch()
|
||||||
@ -144,13 +145,13 @@ def pr_count(start, end, commits, author_roster, commit_roster):
|
|||||||
return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
|
return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
|
||||||
|
|
||||||
def for_files():
|
def for_files():
|
||||||
csv_path = "final_data/kk_final_readme_roster.csv"
|
csv_path = "final_data/deb_readme_roster.csv"
|
||||||
count = 0
|
count = 0
|
||||||
with open(csv_path, 'r') as file:
|
with open(csv_path, 'r') as file:
|
||||||
csv_reader = csv.DictReader(file)
|
csv_reader = csv.DictReader(file)
|
||||||
with open('kk_test_031424_pr_did.csv', "w") as writing_file:
|
with open('kk_test_031624_pr_did.csv', "w") as writing_file:
|
||||||
# this would also have to get switched fro the cont dataset
|
# this would also have to get switched fro the cont dataset
|
||||||
keys = ['upstream_vcs_link', "first_readme", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
|
keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
|
||||||
dict_writer = csv.DictWriter(writing_file, keys)
|
dict_writer = csv.DictWriter(writing_file, keys)
|
||||||
dict_writer.writeheader()
|
dict_writer.writeheader()
|
||||||
for row in csv_reader:
|
for row in csv_reader:
|
||||||
@ -162,9 +163,9 @@ def for_files():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
#for_files()
|
for_files()
|
||||||
file_get_pr("https://github.com/tqdm/tqdm", True)
|
#file_get_pr("https://github.com/tqdm/tqdm", True)
|
||||||
#file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
|
#file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
|
||||||
#file_get_pr("https://github.com/walling/unorm/issues/new/", True)
|
#file_get_pr("https://github.com/walling/unorm/issues/new/", True)
|
||||||
file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)
|
#file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user