backup for expanded contributor data

This commit is contained in:
Matthew Gaughan 2024-01-22 11:20:52 -06:00
parent 2473daf7f2
commit 1ae6c6ce7e
4 changed files with 2160 additions and 11 deletions

2129
011824_uni_contrib.csv Normal file

File diff suppressed because it is too large Load Diff

21
cleaning_contrib_files.py Normal file
View File

@ -0,0 +1,21 @@
import csv
import os
import json
def csv_count(path="011824_uni_contrib.csv"):
    """Count the CSV rows that have at least one non-zero value in columns 2-5.

    Every non-blank row is reported as it is examined: a row whose columns
    2, 3, 4 and 5 all equal the string '0' prints "zeroes"; any other row is
    printed in full and counted. The final count is printed and returned.

    Args:
        path: CSV file to read. Defaults to the file name this script was
            written against, so a no-argument call behaves as before.

    Returns:
        The number of rows in which columns 2-5 are not all '0'.

    Raises:
        IndexError: may be raised for a non-blank row that is too short to
            index columns 2-5.
    """
    true_rep_counter = 0
    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded inside quoted fields.
    with open(path, "r", newline="") as file:
        for line in csv.reader(file):
            if not line:
                # csv.reader yields an empty list for a blank line; there is
                # nothing to compare, so skip it instead of crashing.
                continue
            # NOTE(review): a header row, if the file has one, is not all
            # '0' and is therefore counted as well -- confirm this is wanted.
            if line[2] == line[3] == line[4] == line[5] == '0':
                print("zeroes")
            else:
                print(line)
                true_rep_counter += 1
    print(true_rep_counter)
    return true_rep_counter
# Run the count only when this file is executed as a script, not on import.
if __name__ == "__main__":
    csv_count()

View File

@ -7,6 +7,7 @@ nltk.download('wordnet')
def main():
instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
all_word_counts = []
for filename in os.listdir(instructions_dir):
instructions_metadata = {}
print(filename)
@ -21,7 +22,11 @@ def main():
# pulling whether or not keywords like "Checklist" or "Process" occur?
# pulling whether "HOWTO" occurs?
unique_word_count = len(lemmatized_words)
if "checklist" in lemmatized_words or "process" in lemmatized_words:
print('contains keyword')
print(word_count)
all_word_counts.append(unique_word_count)
print(sum(all_word_counts)/len(all_word_counts))

View File

@ -9,7 +9,7 @@ import ast
#sys.path.append('../octohatrack')
#import octohatrack.__main__ as oh
csv_path = "011523_uni_contrib.csv"
csv_path = "011824_uni_contrib.csv"
def main():
wd = os.getcwd()
@ -24,19 +24,18 @@ def main():
if "github" not in line[5]:
continue
repo_name = line[5].strip()[19:]
print(repo_name)
project_dict["project_name"] = repo_name.split("/")[1]
project_dict["project_owner"]= repo_name.split("/")[0]
os.chdir("../octohatrack")
os.environ["GITHUB_TOKEN"] =
try:
octohatrack_results = subprocess.run(['python3', '-m', 'octohatrack', repo_name, '--wait-for-reset'], capture_output = True, text=True).stdout
octohatrack_results = subprocess.run(['python3', '-m', 'octohatrack', repo_name, '--wait-for-reset'], capture_output = True, text=True, timeout=60).stdout
except:
print("issue with the repository, string indices must be integers")
continue
os.chdir(wd)
split_results = octohatrack_results.split("\n")
for entry in split_results:
print("-------------")
if "{'api_contributors'" in entry:
formatted = ast.literal_eval(entry)
project_dict["api_contrib_count"] = len(formatted['api_contributors'])
@ -47,15 +46,10 @@ def main():
#project_dict["file_contrib_list"] = formatted['file_contributors']
project_dict["wiki_contrib_count"] = len(formatted['file_contributors'])
#project_dict["wiki_contrib_list"] = formatted['file_contributors']
print(project_dict)
dict_writer.writerow(project_dict)
with open('/data/users/mgaughan/kkex_contrib_uni_011523/' + 'contrib_roster_' + project_dict["project_name"] + '.json', 'w') as data_path:
with open('/data/users/mgaughan/b_kkex_contrib_uni_011824/' + 'contrib_roster_' + project_dict["project_name"] + '.json', 'w') as data_path:
json.dump(formatted, data_path)
#os.chdir("../octohatrack")
#subprocess.run(['python3', '-m', 'octohatrack', 'bluesky-social/atproto'])
#os.chdir(wd)
#def parse_results(string_results):
# for letter in string_results:
if __name__ == "__main__":
main()