
misc data collection updates

This commit is contained in:
Matthew Gaughan 2025-02-12 11:48:46 -06:00
parent d2aca29ee2
commit 44cc0d0bb7
8 changed files with 169477 additions and 32 deletions

File diff suppressed because it is too large


@@ -0,0 +1,24 @@
url
https://github.com/MW-Peachy/Peachy
https://github.com/addwiki/mediawiki-api-base
https://github.com/addwiki/mediawiki-api
https://github.com/hamstar/Wikimate
https://github.com/alexz-enwp/wikitools
https://github.com/Riamse/ceterach
https://github.com/eldur/jwbf
https://github.com/CXuesong/WikiClientLibrary
https://github.com/mer-c/wiki-java
https://github.com/greencardamom/BotWikiAwk
https://github.com/mwclient/mwclient
https://github.com/earwig/mwparserfromhell
https://github.com/barrust/mediawiki
https://github.com/ankostis/MatWiki
https://github.com/WPCleaner/wpcleaner
https://github.com/fastily/jwiki
https://github.com/wikimedia-gadgets/mock-mediawiki
https://github.com/kanasimi/wikiapi
https://github.com/FTB-Gamepedia/MediaWiki-Butt-Ruby
https://github.com/wikimedia/mediawiki-ruby-api
https://github.com/kenpratt/wikipedia-client
https://github.com/jpatokal/mediawiki-gateway
https://github.com/AccelerationNet/cl-mediawiki
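The new file above is a one-column CSV (header url) listing the MediaWiki bot-framework repositories to collect. A minimal sketch of how such a list could be consumed downstream, assuming pandas; the filename bot_framework_urls.csv is a placeholder, since the file's actual path is not shown in this diff:

import pandas as pd

# Placeholder filename: the diff does not show where this url list lives.
frameworks = pd.read_csv("bot_framework_urls.csv")["url"]
for url in frameworks:
    # Derive owner/repo pairs from the GitHub URLs, e.g. ("mwclient", "mwclient")
    owner, repo = url.rstrip("/").split("/")[-2:]
    print(owner, repo)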


@@ -8,32 +8,28 @@ file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"
cst = datetime.timezone(datetime.timedelta(hours=-6))
'''
repos = {
    "core" : {
        "url": "https://gerrit.wikimedia.org/r/mediawiki/core",
        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
        "to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
    "parsoid" : {
        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
        "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
        "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
    },
    "visualeditor": {
        "url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
        "to_date": datetime.datetime(2013, 5, 1, 00, 00, 00, tzinfo=cst)
    }
}
}
'''
repos = {
    "visualeditor": {
    "parsoid" : {
        "url": "https://gerrit.wikimedia.org/r/pywikibot/core",
        "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
        "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
    }
}
}
'''
for repo in repos.keys():
    print(repos[repo])
    print(repos[repo]['url'])
    print(repos[repo]["from_date"])
    print(repos[repo]["to_date"])
    repo_info = repos[repo]
    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=True, csv_loc_prefix=file_location)
    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=False, csv_loc_prefix=file_location)

58 src/lib/gerrit_get.py Normal file

@@ -0,0 +1,58 @@
#from phabricator import Phabricator
import os, sys
import json
import numpy as np
import pandas as pd
import requests
import re
import datetime
import time
from urllib.parse import quote_plus
from requests.auth import HTTPDigestAuth
from pygerrit2 import GerritRestAPI, HTTPBasicAuth

# GET /changes/?q=status:abandoned&q=before:{date}&q={TERM}
# GET https://gerrit.wikimedia.org/r/changes/?q=status:abandoned+visualeditor
def query_changes(
    query_terms,
    limit = 100,
    api_url_base = 'https://gerrit.wikimedia.org/',
    sleep = 10
):
    time.sleep(sleep)
    to_query = 1
    after = None  # placeholder for a pagination cursor (unused for now)
    data = []
    while to_query == 1:
        time.sleep(sleep)
        joined_terms = "+".join(query_terms)
        params = {
            'q' : joined_terms,
        }
        # URL is built by hand rather than with params=, since requests would
        # percent-encode the '+' separators Gerrit expects between operators
        api_url = f"{api_url_base}r/changes/?q={joined_terms}"
        #add no-limit and API key somewhere
        #auth = HTTPBasicAuth("ggonnemm", "1V6txZh5X+N3JpDMm5zqZM2M7ewA5D09g4ABOZAl5Q")
        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
        # Gerrit prefixes JSON bodies with ")]}'" to defeat XSSI; strip those
        # five characters, then parse the remaining string with json.loads
        result = json.loads(response.text[5:])
        #print(response.text)
        ## the data
        data_tmp = result
        data += data_tmp
        print(data[-1])
        ## check if there are more results to query
        break
    return data

if __name__ == "__main__":
    query_strings = ['before:2016-12-31', 'visualeditor']
    results = query_changes(query_strings)
    print(results)
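query_changes above breaks after the first page. For reference, Gerrit's REST API marks a truncated result by setting _more_changes on the last change in the list and accepts an S (start) offset to resume; a hedged sketch of a full pagination loop under that documented behavior (illustrative names, not code from this commit):

import json
import time
import requests

def fetch_all_changes(query, api_url_base="https://gerrit.wikimedia.org/", page_size=100, sleep=10):
    # Gerrit sets "_more_changes": true on the last element of a truncated
    # page; S=<offset> skips the changes already fetched.
    data, start = [], 0
    while True:
        time.sleep(sleep)
        url = f"{api_url_base}r/changes/?q={query}&n={page_size}&S={start}"
        resp = requests.get(url, headers={'Content-Type': 'application/json'})
        page = json.loads(resp.text[5:])  # strip Gerrit's ")]}'" XSSI prefix
        data += page
        if not page or not page[-1].get('_more_changes'):
            return data
        start += len(page)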

8 src/lib/gerrit_query.sh Executable file

@@ -0,0 +1,8 @@
#!/bin/bash
API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
QUERY_STRING="before:2013-03-29+visualeditor"
API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"
curl -X GET "$API_URL" -H "Accept: application/json" -o response.json
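The response.json written by this script still starts with Gerrit's )]}' anti-XSSI guard line, so it is not valid JSON as saved. A small sketch of reading it back in Python (only the filename comes from the script above):

import json

with open("response.json") as f:
    raw = f.read()
# Drop the guard line Gerrit prepends before the JSON body, if present.
body = raw.split("\n", 1)[1] if raw.startswith(")]}'") else raw
changes = json.loads(body)
print(len(changes), "changes returned")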


@@ -36,12 +36,17 @@ def query_task_tag(
    data = []
    # for bot frameworks
    # listed on the help page as of 2-12-2024
    # utilizing git as their VCS
    while to_query == 1:
        time.sleep(sleep)
        params = {
            'api.token' : api_token,
            'constraints[query]':[tag_term], ## term that task is searched for with
            #'constraints[projects]':["VisualEditor"], ## term that task is tagged with
            # seemed to be artificially limiting the data that was returned, unrealistically low count values
            #'constraints[projects]':[tag_term], ## term that task is tagged with
            'constraints[createdStart]':ts1, ## timestamp task creation (min)
            'constraints[createdEnd]':ts2, ## timestamp task creation (max)
            'limit':limit,
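The hunk above tunes the search constraints; the surrounding loop (not shown in this diff) pages through results. For reference, Conduit search endpoints such as maniphest.search return a result["cursor"]["after"] token that is passed back as the after parameter until it comes back empty; a hedged sketch under that documented behavior, with illustrative names rather than this repository's helpers:

import time
import requests

def search_all_tasks(api_token, constraints, api_base="https://phabricator.wikimedia.org/api/", sleep=10):
    # constraints uses flattened Conduit keys, e.g. {'constraints[query]': [term]},
    # matching the params dict built in the hunk above.
    data, after = [], None
    while True:
        time.sleep(sleep)
        params = {'api.token': api_token, 'limit': 100, **constraints}
        if after:
            params['after'] = after  # resume from the previous page's cursor
        resp = requests.post(api_base + "maniphest.search", data=params).json()
        data += resp['result']['data']
        after = resp['result']['cursor']['after']
        if not after:
            return data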
@@ -127,19 +132,20 @@ if __name__ == "__main__":
    token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
    api_base = 'https://phabricator.wikimedia.org/api/'
    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2024, 6, 10, 0, 0, 0)))
    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 10, 10, 0, 0, 0)))
    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
    for tag in tags:
        p_data = query_task_tag("Parsoid", ts1=p_ts1, ts2=p_ts2)
        p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)
        for entry in p_data:
            task_id = entry['phid']
            print(task_id)
            transactions = query_transactions_phid_task(task_id)
            comments = {}
            for item in transactions:
                comments[item['id']] = item['comments']
            entry['task_comments'] = comments
        DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
        with open(DATA_PREFIX + "parsoid/" + "2024_6_10_to_2024_10_10.json", "w") as outfile1:
            json.dump(p_data, outfile1)
        for entry in p_data:
            task_id = entry['phid']
            print(task_id)
            transactions = query_transactions_phid_task(task_id)
            comments = {}
            for item in transactions:
                comments[item['id']] = item['comments']
            entry['task_comments'] = comments
        DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
        with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
            json.dump(p_data, outfile1)


@@ -24,7 +24,7 @@ def toArray(str):
if __name__ == "__main__":
    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/"
    # Note: string unescaping and array conversion is done later
    mediawiki_history_schema = StructType([
@@ -200,4 +200,4 @@ if __name__ == "__main__":
        sort(F.desc("day")). \
        show(10, False)
    activity_count_df.write.format("csv").save("012825_nonbot_single.csv")
    activity_count_df.write.format("csv").save("020925_nonbot_yearly.csv")
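Because Spark's write.format("csv").save(...) emits a directory of part files rather than a single CSV, downstream consumers need to glob the shards; a hedged sketch of reading the yearly output back with pandas (illustrative, not part of this commit):

import glob
import pandas as pd

# 020925_nonbot_yearly.csv is a directory of part-*.csv shards written by Spark;
# header=None since the job above did not set a header option on write.
parts = sorted(glob.glob("020925_nonbot_yearly.csv/part-*.csv"))
activity = pd.concat((pd.read_csv(p, header=None) for p in parts), ignore_index=True)
print(activity.shape)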

164630 src/lib/ve_response.json Normal file

File diff suppressed because it is too large