overdue backup

parent 44cc0d0bb7
commit 663862c7d8
src/helper_scripts/cleaning_scripts/0214_https_gerrit_test.csv (new file, 22644 lines)
File diff suppressed because it is too large
src/helper_scripts/cleaning_scripts/0214_ve_gerrit_test.csv (new file, 5387 lines)
File diff suppressed because it is too large
@@ -4648,7 +4648,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4691,7 +4691,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
src/helper_scripts/cleaning_scripts/gerrit_cleaning.ipynb (new file, 22801 lines)
File diff suppressed because it is too large
src/helper_scripts/intermediary_script.py (new file, 219 lines)
@@ -0,0 +1,219 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse

'''
RUNNING INSTRUCTIONS:
[0] set up ssh-agent and add id_rsa to it
[1] set up a tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install the pip packages
[4] in your tmux environment, apply the following three settings
    - os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    - os.environ['GIT_ASKPASS'] = 'false'
    - os.environ['GIT_TERMINAL_PROMPT'] = '0'
[5] in tmux, run the script as follows with your START and STOP values
    - python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs

THANK YOU VERY MUCH - matt
'''

# FIX BELOW: temp_dir is where the repositories will be temporarily cloned;
# if you are worried about space, specify a location here
temp_dir = "/data/users/mgaughan/tmp3"

cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(2010, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)

# FIX BELOW: this is where the commit data will be stored; the parent directory
# below needs to contain the subdirs contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/bot_frameworks/"


def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : url link to upstream repo vcs
        temp_location : filepath to where the repo should be cloned to

    RETURNS
        repo : the GitRepository object of the cloned repo
        repo_path : the filepath to the cloned repository
    """
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path


def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository

    RETURNS
        whether or not the deletion was a success
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1


# parses through commits in reverse chronological order,
# hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # skip commits newer than start_date; stop once older than cutoff_date
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        commits_info.append(commit_info)
    return commits_info


def diff_analysis(diffs):
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects


def for_all_files(start_index, stop_index):
    cwd = os.getcwd()
    #csv_path = "for_batching/deb_full_data.csv"
    csv_path = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/frameworks_for_collection.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    try:
        with open(csv_path, 'r') as file:
            csv_reader = csv.DictReader(file)  # note: unused; rows are re-read below with csv.reader
            lines = [line for line in file]
            for row in tqdm(csv.reader(lines), total=len(lines)):
                index += 1
                if index < start_index:
                    continue
                time.sleep(5)
                if row[0] == "":
                    empty_row += 1
                    continue
                #row[5] = upstream vcs
                temp_repo_path = ""
                und_repo_id = ""
                try:
                    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                    os.environ['GIT_ASKPASS'] = 'false'
                    os.environ['GIT_TERMINAL_PROMPT'] = '0'
                    ssh_url = ""
                    try:
                        if "github" in row[0]:
                            repo_id = row[0][len('https://github.com/'):]
                            ssh_url = f'git@github.com:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                        else:
                            parts = row[0].split('/')
                            domain = parts[2]
                            repo_id = '/'.join(parts[3:])
                            try:
                                temp_repo, temp_repo_path = temp_clone(row[0], temp_dir)
                            except Exception as e:
                                print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                                delete_clone(temp_dir)
                                ssh_url = f'git@{domain}:{repo_id}.git'
                                if ssh_url.endswith('.git.git'):
                                    ssh_url = ssh_url[:-4]
                                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    except Exception as e:
                        print(f'cloning error at {row[0]}')
                        print(f'inside cloning error: {e}')
                        raise ValueError(e)
                    commits_array = commit_analysis(temp_repo, from_date, to_date)
                    commits_df = pd.DataFrame.from_records(commits_array)
                    und_repo_id = '_'.join(repo_id.split("/"))
                    commits_df.to_csv(
                        f"{COMMIT_SAVE_PREFIX}{und_repo_id}_commits.csv",
                        index=False,
                    )
                except Exception as e:
                    clone_error.append([row[0], str(e)])
                    print(f"outside cloning error: {e}")
                finally:
                    und_repo_id = ""
                    delete_clone(temp_dir)
                    os.chdir(cwd)

                if index == stop_index:
                    break
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        print(clone_error)
        with open(f"s_{start_index}_{stop_index}-clone-error-output.txt", "w") as txt_file:
            for error in clone_error:
                txt_file.write(', '.join(error) + "\n")
        with open(f"s_{start_index}_{stop_index}-success-output.txt", "w") as txt_file:
            txt_file.write(f"Number of Empty Rows: {empty_row} \n")
            txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
            txt_file.write(f"Number that has README: {has_readme} \n")
            txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")


if __name__ == "__main__":
    # parse the START/STOP values described in the running instructions above
    parser = argparse.ArgumentParser()
    parser.add_argument("--start_index", type=int, default=1)
    parser.add_argument("--stop_index", type=int, default=30)
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)
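The os.environ lines in step [4] of the RUNNING INSTRUCTIONS are Python statements; when preparing a tmux shell before launching the script, the equivalent setup would look like the sketch below (the shell form is an assumption, not part of the commit):

# shell equivalents of the script's os.environ settings
export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no'   # never prompt on unknown hosts
export GIT_ASKPASS=false                                   # fail instead of asking for a password
export GIT_TERMINAL_PROMPT=0                               # disable interactive git prompts

# run one batch of rows, e.g. rows 1 through 30
python3 intermediary_script.py --start_index 1 --stop_index 30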
@@ -10,8 +10,8 @@ repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"
 cst = datetime.timezone(datetime.timedelta(hours=-6))

 repos = {
-    "parsoid" : {
-        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
+    "wmf_config" : {
+        "url": "https://gerrit.wikimedia.org/r/operations/mediawiki-config",
         "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     },
@@ -1,18 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples\n"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
@@ -1,7 +1,7 @@
 #!/bin/bash

 API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
-QUERY_STRING="before:2013-03-29+visualeditor"
+QUERY_STRING="project:mediawiki/core+before:2013-03-30+HTTPS"

 API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"
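The updated QUERY_STRING targets HTTPS-related changes to mediawiki/core. A minimal sketch of consuming the resulting endpoint, assuming curl and jq are available; note that Gerrit prefixes its JSON responses with an anti-XSSI line ()]}') that must be stripped before parsing:

# fetch the query and drop Gerrit's ")]}'" prefix line, then count the changes returned
curl -s "https://gerrit.wikimedia.org/r/changes/?q=project:mediawiki/core+before:2013-03-30+HTTPS" \
  | tail -n +2 | jq 'length'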
src/lib/gerrit_get/https_core_response_gerrit.json (new file, 685362 lines)
File diff suppressed because it is too large
src/lib/gerrit_get/response.json (new file, 2 lines)
File diff suppressed because one or more lines are too long
@@ -16,7 +16,7 @@ def query_task_tag(
     ts1 = None, ts2 = None,
     api_url_base = 'https://phabricator.wikimedia.org/api/maniphest.search',
     api_token = "api-wurg254ciq5uvfxlr4rszn5ynpy4",
-    sleep = 10
+    sleep = 7
 ):
     '''
     query all tasks tagged with specific tag
@@ -55,9 +55,9 @@ def query_task_tag(
         }

         response = requests.get( api_url_base, params=params)
-        print(response)
+        #print(response)
         result = json.loads(response.text)['result']
-        print(result)
+        #print(result)
         ## the data
         data_tmp = result['data']
         data += data_tmp
@@ -79,7 +79,7 @@ def query_transactions_phid_task(
     limit = 100,
     api_url_base = 'https://phabricator.wikimedia.org/api/transaction.search',
     api_token = 'api-grocgdq2767cx6v5ywckkjmetx2f',
-    sleep = 10,
+    sleep = 7,
 ):
     '''
     query all transactions for a task (task_phid).
@@ -125,17 +125,21 @@ def query_transactions_phid_task(
 if __name__ == "__main__":
     # phab=Phabricator("https://phabricator.wikimedia.org/")
     tags = [
-        "VisualEditor",
-        "Parsoid"
+        "Parsoid",
+        "https"
     ]
     #set phabricator api token
     token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
     api_base = 'https://phabricator.wikimedia.org/api/'

-    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
+    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2022, 1, 1, 0, 0, 0)))
     p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
     for tag in tags:
+
+        if tag == "https":
+            p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2013, 1, 1, 0, 0, 0)))
+            p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2016, 12, 31, 0, 0, 0)))
+
         p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)

         for entry in p_data:
@@ -145,7 +149,7 @@ if __name__ == "__main__":
             comments = {}
             for item in transactions:
                 comments[item['id']] = item['comments']
             entry['task_comments'] = comments
-    DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
-    with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
+    DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
+    with open(f"{DATA_PREFIX}{tag}_phab_data.json", "w") as outfile1:
         json.dump(p_data, outfile1)
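query_task_tag wraps Phabricator's Conduit maniphest.search endpoint, which the hunks above re-point at the Parsoid and https tags. A hedged sketch of the underlying call for the https window, with <TOKEN> as a placeholder and the constraint names taken from Conduit's documented maniphest.search parameters:

# timestamps: 1356998400 = 2013-01-01 UTC, 1483142400 = 2016-12-31 UTC
curl -s 'https://phabricator.wikimedia.org/api/maniphest.search' \
  -d 'api.token=<TOKEN>' \
  -d 'constraints[projects][0]=https' \
  -d 'constraints[createdStart]=1356998400' \
  -d 'constraints[createdEnd]=1483142400'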
@@ -436,7 +436,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -1,6 +0,0 @@
-def main():
-    print('jkasdfjhksdjhksdfsd')
-
-
-if __name__ == "__main__":
-    main()