
overdue backup

Matthew Gaughan 2025-02-14 12:58:27 -06:00
parent 44cc0d0bb7
commit 663862c7d8
18 changed files with 736435 additions and 318 deletions

View File

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -4648,7 +4648,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -4691,7 +4691,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [

File diff suppressed because it is too large

View File

@@ -0,0 +1,219 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
'''
RUNNING INSTRUCTIONS:
[0] set up ssh-agent and add your id_rsa key to it
[1] set up a tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install the required pip packages (GitPython, tqdm, pandas)
[4] in your tmux shell, export the GIT_SSH_COMMAND, GIT_ASKPASS, and GIT_TERMINAL_PROMPT variables shown in the sketch below (the script also sets them itself via os.environ)
[5] in tmux, run the script with your START and STOP values:
- python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so please check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
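# A minimal sketch (assumed shell syntax; the tmux session name is illustrative) of the
# setup described in steps [0]-[5] above. The script also sets the same GIT_* variables
# itself via os.environ further down, so the exports are only a safety net:
#   eval "$(ssh-agent -s)" && ssh-add ~/.ssh/id_rsa
#   tmux new -s clone-backfill
#   export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no' GIT_ASKPASS=false GIT_TERMINAL_PROMPT=0
#   python3 intermediary_script.py --start_index START --stop_index STOP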
#FIX BELOW: temp_dir is where repositories will be temporarily cloned; if you are worried about space, specify a different location here
temp_dir = "/data/users/mgaughan/tmp3"
cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(2010, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)
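# collection window: commit_analysis() below only keeps commits committed between from_date and to_date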
#FIX BELOW: this is where the commit data will be stored; the parent directory below needs to contain the subdirs contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/bot_frameworks/"
def temp_clone(vcs_link, temp_location):
"""
ARGS
vcs_link : url link to upstream repo vcs
temp_location : filepath to where the repo should be cloned to
RETURNS
repo : the git.Repo object of the cloned repo
repo_path : the filepath to the cloned repository
"""
#print(temp_location)
vcs_link = vcs_link.strip()
os.makedirs(temp_location)
repo_path = temp_location
repo = git.Repo.clone_from(vcs_link, repo_path)
print(f"Successfully Cloned {vcs_link}")
return repo, repo_path
def delete_clone(temp_location):
"""
ARGS
temp_location : filepath to the cloned repository
RETURNS
0 if the clone was deleted, 1 if there was no clone at the location
"""
if os.path.exists(temp_location):
shutil.rmtree(temp_location)
print(f"{temp_location} has been deleted.")
return 0
else:
print("No clone at location")
return 1
# iterates through commits in reverse chronological order, hence the apparently flipped argument names (start_date is the newer bound, cutoff_date the older one)
def commit_analysis(repo, cutoff_date, start_date):
print("Analyzing Commits...")
commits_info = []
for commit in repo.iter_commits():
# skip commits more recent than the end of the window
if commit.committed_datetime > start_date:
continue
# once commits are older than the cutoff date, stop iterating
if commit.committed_datetime < cutoff_date:
break
commit_info = {
"commit_hash": commit.hexsha,
"author_name": commit.author.name,
"author_email": commit.author.email,
"authored_date": commit.authored_datetime,
"committer_name": commit.committer.name,
"committer_email": commit.committer.email,
"commit_date": commit.committed_datetime,
"message": commit.message,
"is_merge": len(commit.parents) > 1,
}
# author/committer org information
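# e.g. "user@wikimedia.org" -> "wikimedia" (first label of the email domain)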
commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
# some more effort to get this information
commit_info["branches"] = repo.git.branch(
"--contains", commit_info["commit_hash"]
)
# diff information
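# root commits have no parent, so diff against the empty tree (git.NULL_TREE)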
diffs = commit.diff(
commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
)
commit_info["diff_info"] = diff_analysis(diffs)
# print(commit_info)
commits_info.append(commit_info)
return commits_info
def diff_analysis(diffs):
diff_objects = []
for diff in diffs:
diff_info = {
"lines_added": sum(
1
for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
if line.startswith("+") and not line.startswith("+++")
),
"lines_deleted": sum(
1
for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
if line.startswith("-") and not line.startswith("---")
),
"parent_filepath": diff.a_path,
"child_filepath": diff.b_path,
"change_type": diff.change_type,
"new_file": diff.new_file,
"deleted_file": diff.deleted_file,
"renamed_file": diff.renamed,
#'diff': diff.diff.decode('utf-8')
}
diff_objects.append(diff_info)
return diff_objects
def for_all_files(start_index, stop_index):
cwd = os.getcwd()
#csv_path = "for_batching/deb_full_data.csv"
csv_path = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/mw-convo-collections/src/helper_scripts/frameworks_for_collection.csv"
index = -1
saved = []
empty_row = 0
clone_error =[]
has_readme = 0
has_contributing = 0
try:
with open(csv_path, 'r') as file:
lines = [line for line in file]
for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1
if index < start_index:
continue
time.sleep(5)
if row[0] == "":
empty_row += 1
continue
# row[0] = upstream vcs link
temp_repo_path = ""
und_repo_id = ""
try:
os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
os.environ['GIT_ASKPASS'] = 'false'
os.environ['GIT_TERMINAL_PROMPT'] = '0'
ssh_url = ""
try:
if "github" in row[0]:
repo_id = row[0][len('https://github.com/'):]
ssh_url = f'git@github.com:{repo_id}.git'
if ssh_url.endswith('.git.git'):
ssh_url = ssh_url[:-4]
temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
else:
parts = row[0].split('/')
domain = parts[2]
repo_id = '/'.join(parts[3:])
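# e.g. "https://gitlab.wikimedia.org/repos/foo/bar" -> domain "gitlab.wikimedia.org", repo_id "repos/foo/bar"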
try:
temp_repo, temp_repo_path = temp_clone(row[0], temp_dir)
except Exception as e:
print(f'non-Github cloning error, assuming HTTPS issue: {e}')
delete_clone(temp_dir)
ssh_url = f'git@{domain}:{repo_id}.git'
if ssh_url.endswith('.git.git'):
ssh_url = ssh_url[:-4]
temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
except Exception as e:
print(f'cloning error at {row[0]}')
print(f'inside cloning error: {e}')
raise ValueError(e)
commits_array = commit_analysis(temp_repo, from_date, to_date)
commits_df = pd.DataFrame.from_records(commits_array)
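# flatten "owner/repo" into "owner_repo" for the per-repo output filename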
und_repo_id = '_'.join(repo_id.split("/"))
commits_df.to_csv(
f"{COMMIT_SAVE_PREFIX}{und_repo_id}_commits.csv",
index=False,
)
except Exception as e:
clone_error.append([row[0], str(e)])
print(f"outside cloning error: {e}")
finally:
und_repo_id = ""
delete_clone(temp_dir)
os.chdir(cwd)
if index == stop_index:
break
except KeyboardInterrupt:
print("KeyBoardInterrrupt")
finally:
print(clone_error)
with open(f"s_{start_index}_{stop_index}-clone-error-output.txt", "w") as txt_file:
for error in clone_error:
txt_file.write(', '.join(error) + "\n")
with open(f"s_{start_index}_{stop_index}-success-output.txt", "w") as txt_file:
txt_file.write(f"Number of Empty Rows: {empty_row} \n")
txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")
txt_file.write(f"Number that has README: {has_readme} \n")
txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
if __name__ == "__main__":
# parse the start/stop indices described in the running instructions (defaults preserve the previous hard-coded 1 and 30)
parser = argparse.ArgumentParser()
parser.add_argument("--start_index", type=int, default=1)
parser.add_argument("--stop_index", type=int, default=30)
args = parser.parse_args()
for_all_files(args.start_index, args.stop_index)

View File

@@ -10,8 +10,8 @@ repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
repos = {
"parsoid" : {
"url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
"wmf_config" : {
"url": "https://gerrit.wikimedia.org/r/operations/mediawiki-config",
"from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
"to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
},

View File

@@ -1,18 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://wikitech.wikimedia.org/wiki/Data_Platform/Data_Lake/Edits/Mediawiki_history_dumps/Python_Pandas_examples\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,7 +1,7 @@
#!/bin/bash
API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
QUERY_STRING="before:2013-03-29+visualeditor"
QUERY_STRING="project:mediawiki/core+before:2013-03-30+HTTPS"
API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -16,7 +16,7 @@ def query_task_tag(
ts1 = None, ts2 = None,
api_url_base = 'https://phabricator.wikimedia.org/api/maniphest.search',
api_token = "api-wurg254ciq5uvfxlr4rszn5ynpy4",
sleep = 10
sleep = 7
):
'''
query all tasks tagged with specific tag
@@ -55,9 +55,9 @@ def query_task_tag(
}
response = requests.get( api_url_base, params=params)
print(response)
#print(response)
result = json.loads(response.text)['result']
print(result)
#print(result)
## the data
data_tmp = result['data']
data += data_tmp
@@ -79,7 +79,7 @@ def query_transactions_phid_task(
limit = 100,
api_url_base = 'https://phabricator.wikimedia.org/api/transaction.search',
api_token = 'api-grocgdq2767cx6v5ywckkjmetx2f',
sleep = 10,
sleep = 7,
):
'''
query all transactions for a task (task_phid).
@@ -125,17 +125,21 @@ def query_transactions_phid_task(
if __name__ == "__main__":
# phab=Phabricator("https://phabricator.wikimedia.org/")
tags = [
"VisualEditor",
"Parsoid"
"Parsoid",
"https"
]
#set phabricator api token
token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
api_base = 'https://phabricator.wikimedia.org/api/'
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2022, 1, 1, 0, 0, 0)))
p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
for tag in tags:
if tag == "https":
p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2013, 1, 1, 0, 0, 0)))
p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2016, 12, 31, 0, 0, 0)))
p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)
for entry in p_data:
@@ -145,7 +149,7 @@ if __name__ == "__main__":
comments = {}
for item in transactions:
comments[item['id']] = item['comments']
entry['task_comments'] = comments
DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
entry['task_comments'] = comments
DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
with open(f"{DATA_PREFIX}{tag}_phab_data.json", "w") as outfile1:
json.dump(p_data, outfile1)

View File

@@ -436,7 +436,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "base",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},

View File

@@ -1,6 +0,0 @@
def main():
print('jkasdfjhksdjhksdfsd')
if __name__ == "__main__":
main()