
overdue backup

Matthew Gaughan 2025-02-14 12:58:27 -06:00
parent 44cc0d0bb7
commit 663862c7d8
18 changed files with 736435 additions and 318 deletions

View File

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -4648,7 +4648,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -4691,7 +4691,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [

File diff suppressed because it is too large

View File

@@ -0,0 +1,219 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
'''
RUNNING INSTRUCTIONS:
[0] set up ssh-agent and add your id_rsa key to it
[1] set up a tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install the required pip packages (GitPython, tqdm, pandas)
[4] in your tmux shell, export the GIT_SSH_COMMAND, GIT_ASKPASS, and GIT_TERMINAL_PROMPT variables shown in the sketch below (the script also sets them itself via os.environ)
[5] in tmux, run the script with your START and STOP values:
- python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so please check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
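# A minimal sketch (assumed shell syntax; the tmux session name is illustrative) of the
# setup described in steps [0]-[5] above. The script also sets the same GIT_* variables
# itself via os.environ further down, so the exports are only a safety net:
#   eval "$(ssh-agent -s)" && ssh-add ~/.ssh/id_rsa
#   tmux new -s clone-backfill
#   export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no' GIT_ASKPASS=false GIT_TERMINAL_PROMPT=0
#   python3 intermediary_script.py --start_index START --stop_index STOP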
#FIX BELOW: temp_dir is where repositories will be temporarily cloned; if you are worried about space, specify a different location here
temp_dir = "/data/users/mgaughan/tmp3"
cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(2010, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
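# collection window: commit_analysis() below only keeps commits committed between from_date and to_date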
#FIX BELOW: this is where the commit data will be stored; the parent directory below needs to contain the subdirs contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/bot_frameworks/"
def temp_clone(vcs_link, temp_location):
"""
ARGS
vcs_link : url link to upstream repo vcs
temp_location : filepath to where the repo should be cloned to
RETURNS
repo : the git.Repo object of the cloned repo
repo_path : the filepath to the cloned repository
"""
#print(temp_location)
vcs_link = vcs_link.strip()
os.makedirs(temp_location)
repo_path = temp_location
repo = git.Repo.clone_from(vcs_link, repo_path)
print(f"Successfully Cloned {vcs_link}")
return repo, repo_path
def delete_clone(temp_location):
"""
ARGS
temp_location : filepath to the cloned repository
RETURNS
0 if the clone was deleted, 1 if there was no clone at the location
"""
if os.path.exists(temp_location):
shutil.rmtree(temp_location)
print(f"{temp_location} has been deleted.")
return 0
else:
print("No clone at location")
return 1
# iterates through commits in reverse chronological order, hence the apparently flipped argument names (start_date is the newer bound, cutoff_date the older one)
def commit_analysis(repo, cutoff_date, start_date):
print("Analyzing Commits...")
commits_info = []
for commit in repo.iter_commits():
# skip commits more recent than the end of the window
if commit.committed_datetime > start_date:
continue
# once commits are older than the cutoff date, stop iterating
if commit.committed_datetime < cutoff_date:
break
commit_info = {
"commit_hash": commit.hexsha,
"author_name": commit.author.name,
"author_email": commit.author.email,
"authored_date": commit.authored_datetime,
"committer_name": commit.committer.name,
"committer_email": commit.committer.email,
"commit_date": commit.committed_datetime,
"message": commit.message,
"is_merge": len(commit.parents) > 1,
}
# author/committer org information
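# e.g. "user@wikimedia.org" -> "wikimedia" (first label of the email domain)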
commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
# some more effort to get this information
commit_info["branches"] = repo.git.branch(
"--contains", commit_info["commit_hash"]
)
# diff information
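# root commits have no parent, so diff against the empty tree (git.NULL_TREE)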
diffs = commit.diff(
commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
)
commit_info["diff_info"] = diff_analysis(diffs)
# print(commit_info)
commits_info.append(commit_info)
return commits_info
def diff_analysis(diffs):
diff_objects = []
for diff in diffs:
diff_info = {
"lines_added": sum(
1
for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
if line.startswith("+") and not line.startswith("+++")
),
"lines_deleted": sum(
1
for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
if line.startswith("-") and not line.startswith("---")
),
"parent_filepath": diff.a_path,
"child_filepath": diff.b_path,
"change_type": diff.change_type,
"new_file": diff.new_file,
"deleted_file": diff.deleted_file,
"renamed_file": diff.renamed,
#'diff': diff.diff.decode('utf-8')
}
diff_objects.append(diff_info)
return diff_objects
def for_all_files(start_index, stop_index):
cwd = os.getcwd()
#csv_path = "for_batching/deb_full_data.csv"
csv_path = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/frameworks_for_collection.csv"
index = -1
saved = []
empty_row = 0
clone_error =[]
has_readme = 0
has_contributing = 0
try:
with open(csv_path, 'r') as file:
lines = [line for line in file]
for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1
if index < start_index:
continue
time.sleep(5)
if row[0] == "":
empty_row += 1
continue
# row[0] = upstream vcs link
temp_repo_path = ""
und_repo_id = ""
try:
os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
os.environ['GIT_ASKPASS'] = 'false'
os.environ['GIT_TERMINAL_PROMPT'] = '0'
ssh_url = ""
try:
if "github" in row[0]:
repo_id = row[0][len('https://github.com/'):]
ssh_url = f'git@github.com:{repo_id}.git'
if ssh_url.endswith('.git.git'):
ssh_url = ssh_url[:-4]
temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
else:
parts = row[0].split('/')
domain = parts[2]
repo_id = '/'.join(parts[3:])
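# e.g. "https://gitlab.wikimedia.org/repos/foo/bar" -> domain "gitlab.wikimedia.org", repo_id "repos/foo/bar"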
try:
temp_repo, temp_repo_path = temp_clone(row[0], temp_dir)
except Exception as e:
print(f'non-Github cloning error, assuming HTTPS issue: {e}')
delete_clone(temp_dir)
ssh_url = f'git@{domain}:{repo_id}.git'
if ssh_url.endswith('.git.git'):
ssh_url = ssh_url[:-4]
temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
except Exception as e:
print(f'cloning error at {row[0]}')
print(f'inside cloning error: {e}')
raise ValueError(e)
commits_array = commit_analysis(temp_repo, from_date, to_date)
commits_df = pd.DataFrame.from_records(commits_array)
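# flatten "owner/repo" into "owner_repo" for the per-repo output filename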
und_repo_id = '_'.join(repo_id.split("/"))
commits_df.to_csv(
f"{COMMIT_SAVE_PREFIX}{und_repo_id}_commits.csv",
index=False,
)
except Exception as e:
clone_error.append([row[0], str(e)])
print(f"outside cloning error: {e}")
finally:
und_repo_id = ""
delete_clone(temp_dir)
os.chdir(cwd)
if index == stop_index:
break
except KeyboardInterrupt:
print("KeyBoardInterrrupt")
finally:
print(clone_error)
with open(f"s_{start_index}_{stop_index}-clone-error-output.txt", "w") as txt_file:
for error in clone_error:
txt_file.write(', '.join(error) + "\n")
with open(f"s_{start_index}_{stop_index}-success-output.txt", "w") as txt_file:
txt_file.write(f"Number of Empty Rows: {empty_row} \n")
txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
txt_file.write(f"Number that has README: {has_readme} \n")
txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
if __name__ == "__main__":
# parse the start/stop indices described in the running instructions (defaults preserve the previous hard-coded 1 and 30)
parser = argparse.ArgumentParser()
parser.add_argument("--start_index", type=int, default=1)
parser.add_argument("--stop_index", type=int, default=30)
args = parser.parse_args()
for_all_files(args.start_index, args.stop_index)

View File

@@ -10,8 +10,8 @@ repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
repos = {
"parsoid" : {
"url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
"wmf_config" : {
"url": "https://gerrit.wikimedia.org/r/operations/mediawiki-config",
"from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
"to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
},

View File

@@ -1,18 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,7 +1,7 @@
#!/bin/bash
API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
QUERY_STRING="before:2013-03-29+visualeditor"
QUERY_STRING="project:mediawiki/core+before:2013-03-30+HTTPS"
API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -16,7 +16,7 @@ def query_task_tag(
ts1 = None, ts2 = None,
api_url_base = 'https://phabricator.wikimedia.org/api/maniphest.search',
api_token = "api-wurg254ciq5uvfxlr4rszn5ynpy4",
sleep = 10
sleep = 7
):
'''
query all tasks tagged with specific tag
@@ -55,9 +55,9 @@ def query_task_tag(
}
response = requests.get( api_url_base, params=params)
print(response)
#print(response)
result = json.loads(response.text)['result']
print(result)
#print(result)
## the data
data_tmp = result['data']
data += data_tmp
@@ -79,7 +79,7 @@ def query_transactions_phid_task(
limit = 100,
api_url_base = 'https://phabricator.wikimedia.org/api/transaction.search',
api_token = 'api-grocgdq2767cx6v5ywckkjmetx2f',
sleep = 10,
sleep = 7,
):
'''
query all transactions for a task (task_phid).
@@ -125,17 +125,21 @@ def query_transactions_phid_task(
if __name__ == "__main__":
# phab=Phabricator("https://phabricator.wikimedia.org/")
tags = [
"VisualEditor",
"Parsoid"
"Parsoid",
"https"
]
#set phabricator api token
token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
api_base = 'https://phabricator.wikimedia.org/api/'
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2022, 1, 1, 0, 0, 0)))
p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
for tag in tags:
if tag == "https":
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2013, 1, 1, 0, 0, 0)))
p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2016, 12, 31, 0, 0, 0)))
p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)
for entry in p_data:
@@ -145,7 +149,7 @@ if __name__ == "__main__":
comments = {}
for item in transactions:
comments[item['id']] = item['comments']
entry['task_comments'] = comments
DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
entry['task_comments'] = comments
DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
with open(f"{DATA_PREFIX}{tag}_phab_data.json", "w") as outfile1:
json.dump(p_data, outfile1)

View File

@@ -436,7 +436,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "base",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},

View File

@@ -1,6 +0,0 @@
def main():
print('jkasdfjhksdjhksdfsd')
if __name__ == "__main__":
main()