misc data collection updates

parent d2aca29ee2
commit 44cc0d0bb7
src/helper_scripts/cleaning_json.ipynb (new file, 4723 lines)
File diff suppressed because it is too large
src/helper_scripts/frameworks_for_collection.csv (new file, 24 lines)
@@ -0,0 +1,24 @@
+url
+https://github.com/MW-Peachy/Peachy
+https://github.com/addwiki/mediawiki-api-base
+https://github.com/addwiki/mediawiki-api
+https://github.com/hamstar/Wikimate
+https://github.com/alexz-enwp/wikitools
+https://github.com/Riamse/ceterach
+https://github.com/eldur/jwbf
+https://github.com/CXuesong/WikiClientLibrary
+https://github.com/mer-c/wiki-java
+https://github.com/greencardamom/BotWikiAwk
+https://github.com/mwclient/mwclient
+https://github.com/earwig/mwparserfromhell
+https://github.com/barrust/mediawiki
+https://github.com/ankostis/MatWiki
+https://github.com/WPCleaner/wpcleaner
+https://github.com/fastily/jwiki
+https://github.com/wikimedia-gadgets/mock-mediawiki
+https://github.com/kanasimi/wikiapi
+https://github.com/FTB-Gamepedia/MediaWiki-Butt-Ruby
+https://github.com/wikimedia/mediawiki-ruby-api
+https://github.com/kenpratt/wikipedia-client
+https://github.com/jpatokal/mediawiki-gateway
+https://github.com/AccelerationNet/cl-mediawiki
@@ -8,32 +8,28 @@ file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
 repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"

 cst = datetime.timezone(datetime.timedelta(hours=-6))
-'''
 repos = {
-    "core" : {
-        "url": "https://gerrit.wikimedia.org/r/mediawiki/core",
-        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
-        "to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
+    "parsoid" : {
+        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
+        "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
+        "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     },
-    "visualeditor": {
-        "url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
-        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
-        "to_date": datetime.datetime(2013, 5, 1, 00, 00, 00, tzinfo=cst)
-    }
 }
 '''
-repos = {
-    "visualeditor": {
+    "parsoid" : {
         "url": "https://gerrit.wikimedia.org/r/pywikibot/core",
         "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     }
 }
+'''

 for repo in repos.keys():
     print(repos[repo])
     print(repos[repo]['url'])
     print(repos[repo]["from_date"])
     print(repos[repo]["to_date"])
     repo_info = repos[repo]
-    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=True, csv_loc_prefix=file_location)
+    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=False, csv_loc_prefix=file_location)
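Because the hunk interleaves two revisions and the enclosing file is not named in this extract, the post-commit state of the block is easier to see consolidated. A sketch of the active configuration only, omitting the quoted-out leftovers; this is an editor's reconstruction of the hunk above, with repo_lifecycle assumed to be imported elsewhere in the same file:

import datetime

file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"

# fixed UTC-6 offset used to pin the collection window
cst = datetime.timezone(datetime.timedelta(hours=-6))

repos = {
    "parsoid": {
        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
        "from_date": datetime.datetime(2010, 1, 1, 0, 0, 0, tzinfo=cst),
        "to_date": datetime.datetime(2024, 12, 31, 0, 0, 0, tzinfo=cst)
    },
}

for repo in repos.keys():
    repo_info = repos[repo]
    # clones land under repo_artifacts/<repo>/tmp; to_save=False presumably
    # skips writing the collected CSVs under csv_loc_prefix
    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp",
                   repo_info["from_date"], repo_info["to_date"],
                   to_save=False, csv_loc_prefix=file_location)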
src/lib/gerrit_get.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+#from phabricator import Phabricator
+import os, sys
+import json
+import numpy as np
+import pandas as pd
+import requests
+import re
+import datetime
+import time
+from urllib.parse import quote_plus
+
+from requests.auth import HTTPDigestAuth
+from pygerrit2 import GerritRestAPI, HTTPBasicAuth
+
+# GET /changes/?q=status:abandoned&q=before:{date}&q={TERM}
+# GET https://gerrit.wikimedia.org/r/changes/?q=status:abandoned+visualeditor
+
+def query_changes(
+    query_terms,
+    limit = 100,
+    api_url_base = 'https://gerrit.wikimedia.org/',
+    sleep = 10
+):
+
+    time.sleep(sleep)
+    to_query = 1
+    after = None
+
+    data = []
+
+    while to_query == 1:
+        time.sleep(sleep)
+        joined_terms = "+".join(query_terms)
+        params = {
+            'q' : joined_terms,
+        }
+
+        api_url = f"{api_url_base}r/changes/?q={joined_terms}"
+        #add no-limit and API key somewhere
+
+        #auth = HTTPBasicAuth("ggonnemm", "1V6txZh5X+N3JpDMm5zqZM2M7ewA5D09g4ABOZAl5Q")
+        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
+
+        result = json.loads(response.text[5:])  # loads(), not load(): parse the string after Gerrit's ")]}'" XSSI prefix
+        #print(response.text)
+        ## the data
+        data_tmp = result
+        data += data_tmp
+        print(data[-1])
+        ## check if there are more results to query
+        break
+    return data
+
+
+if __name__ == "__main__":
+    query_strings = ['before:2016-12-31', 'visualeditor']
+    results = query_changes(query_strings)
+    print(results)
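As committed, query_changes fetches only the first page: the loop breaks before the "check if there are more results" step is implemented. A minimal pagination sketch, assuming Gerrit's documented behavior of setting "_more_changes": true on the last entry of a truncated page and accepting an S start-offset parameter; fetch_all_changes is a hypothetical name, not part of this commit:

import json
import time
import requests

def fetch_all_changes(query, api_url_base="https://gerrit.wikimedia.org/", page_size=100, sleep=10):
    data = []
    start = 0
    while True:
        time.sleep(sleep)  # stay polite to the public endpoint
        api_url = f"{api_url_base}r/changes/?q={query}&n={page_size}&S={start}"
        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
        # Gerrit prefixes JSON responses with ")]}'" to block XSSI; skip it
        batch = json.loads(response.text[5:])
        data += batch
        if batch and batch[-1].get("_more_changes"):
            start += len(batch)  # resume after the changes already fetched
        else:
            return data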
src/lib/gerrit_query.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
+QUERY_STRING="before:2013-03-29+visualeditor"
+
+API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"
+
+curl -X GET "$API_URL" -H "Accept: application/json" -o response.json
@@ -36,12 +36,17 @@ def query_task_tag(

     data = []

+    # for bot frameworks
+    # listed on the help page as of 2-12-2024
+    # utilizing git as their VCS
+
     while to_query == 1:
         time.sleep(sleep)
         params = {
             'api.token' : api_token,
             'constraints[query]':[tag_term], ## term that task is searched for with
-            #'constraints[projects]':["VisualEditor"], ## term that task is tagged with
+            # seemed to be artificially limiting the data that was returned, unrealistically low count values
+            #'constraints[projects]':[tag_term], ## term that task is tagged with
             'constraints[createdStart]':ts1, ## timestamp task creation (min)
             'constraints[createdEnd]':ts2, ## timestamp task creation (max)
             'limit':limit,
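For orientation, these constraints feed Phabricator's Conduit method maniphest.search, which pages with an after cursor rather than an offset. A hedged sketch of fetching one page under the same constraint scheme; fetch_task_page is a hypothetical name, not part of this commit:

import requests

def fetch_task_page(api_base, api_token, tag_term, ts1, ts2, limit=100, after=None):
    params = {
        'api.token': api_token,
        'constraints[query]': tag_term,    # full-text search term, as above
        'constraints[createdStart]': ts1,  # creation window start, epoch seconds
        'constraints[createdEnd]': ts2,    # creation window end, epoch seconds
        'limit': limit,
    }
    if after is not None:
        params['after'] = after  # cursor returned by the previous page
    result = requests.post(api_base + 'maniphest.search', data=params).json()['result']
    # result['cursor']['after'] is None once the final page has been returned
    return result['data'], result['cursor']['after']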
@@ -127,10 +132,11 @@ if __name__ == "__main__":
    token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
    api_base = 'https://phabricator.wikimedia.org/api/'

-   p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2024, 6, 10, 0, 0, 0)))
-   p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 10, 10, 0, 0, 0)))
+   p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
+   p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
+   for tag in tags:

-   p_data = query_task_tag("Parsoid", ts1=p_ts1, ts2=p_ts2)
+       p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)

    for entry in p_data:
        task_id = entry['phid']
@@ -139,7 +145,7 @@ if __name__ == "__main__":
        comments = {}
        for item in transactions:
            comments[item['id']] = item['comments']
        entry['task_comments'] = comments
-   DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
-   with open(DATA_PREFIX + "parsoid/" + "2024_6_10_to_2024_10_10.json", "w") as outfile1:
+   DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
+   with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
        json.dump(p_data, outfile1)
@@ -24,7 +24,7 @@ def toArray(str):

 if __name__ == "__main__":

-    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
+    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/"

     # Note: string unescaping and array conversion is done later
     mediawiki_history_schema = StructType([
@@ -200,4 +200,4 @@ if __name__ == "__main__":
         sort(F.desc("day")). \
         show(10, False)

-    activity_count_df.write.format("csv").save("012825_nonbot_single.csv")
+    activity_count_df.write.format("csv").save("020925_nonbot_yearly.csv")
src/lib/ve_response.json (new file, 164630 lines)
File diff suppressed because it is too large