misc data collection updates

parent d2aca29ee2
commit 44cc0d0bb7
src/helper_scripts/cleaning_json.ipynb (new file, 4723 lines)
File diff suppressed because it is too large
src/helper_scripts/frameworks_for_collection.csv (new file, 24 lines)
@@ -0,0 +1,24 @@
+url
+https://github.com/MW-Peachy/Peachy
+https://github.com/addwiki/mediawiki-api-base
+https://github.com/addwiki/mediawiki-api
+https://github.com/hamstar/Wikimate
+https://github.com/alexz-enwp/wikitools
+https://github.com/Riamse/ceterach
+https://github.com/eldur/jwbf
+https://github.com/CXuesong/WikiClientLibrary
+https://github.com/mer-c/wiki-java
+https://github.com/greencardamom/BotWikiAwk
+https://github.com/mwclient/mwclient
+https://github.com/earwig/mwparserfromhell
+https://github.com/barrust/mediawiki
+https://github.com/ankostis/MatWiki
+https://github.com/WPCleaner/wpcleaner
+https://github.com/fastily/jwiki
+https://github.com/wikimedia-gadgets/mock-mediawiki
+https://github.com/kanasimi/wikiapi
+https://github.com/FTB-Gamepedia/MediaWiki-Butt-Ruby
+https://github.com/wikimedia/mediawiki-ruby-api
+https://github.com/kenpratt/wikipedia-client
+https://github.com/jpatokal/mediawiki-gateway
+https://github.com/AccelerationNet/cl-mediawiki
@@ -8,32 +8,28 @@ file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
 repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"

 cst = datetime.timezone(datetime.timedelta(hours=-6))
-'''
 repos = {
-    "core" : {
-        "url": "https://gerrit.wikimedia.org/r/mediawiki/core",
-        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
-        "to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
+    "parsoid" : {
+        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
+        "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
+        "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     },
-    "visualeditor": {
-        "url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
-        "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
-        "to_date": datetime.datetime(2013, 5, 1, 00, 00, 00, tzinfo=cst)
-    }
 }
 '''
-repos = {
-    "visualeditor": {
+    "parsoid" : {
         "url": "https://gerrit.wikimedia.org/r/pywikibot/core",
         "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     }
 }
+'''

 for repo in repos.keys():
     print(repos[repo])
     print(repos[repo]['url'])
     print(repos[repo]["from_date"])
     print(repos[repo]["to_date"])
     repo_info = repos[repo]
-    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=True, csv_loc_prefix=file_location)
+    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=False, csv_loc_prefix=file_location)
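Because the hunk interleaves two revisions and the enclosing file is not named in this extract, the post-commit state of the block is easier to see consolidated. A sketch of the active configuration only, omitting the quoted-out leftovers; this is an editor's reconstruction of the hunk above, with repo_lifecycle assumed to be imported elsewhere in the same file:

import datetime

file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"

# fixed UTC-6 offset used to pin the collection window
cst = datetime.timezone(datetime.timedelta(hours=-6))

repos = {
    "parsoid": {
        "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
        "from_date": datetime.datetime(2010, 1, 1, 0, 0, 0, tzinfo=cst),
        "to_date": datetime.datetime(2024, 12, 31, 0, 0, 0, tzinfo=cst)
    },
}

for repo in repos.keys():
    repo_info = repos[repo]
    # clones land under repo_artifacts/<repo>/tmp; to_save=False presumably
    # skips writing the collected CSVs under csv_loc_prefix
    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp",
                   repo_info["from_date"], repo_info["to_date"],
                   to_save=False, csv_loc_prefix=file_location)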
src/lib/gerrit_get.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+#from phabricator import Phabricator
+import os, sys
+import json
+import numpy as np
+import pandas as pd
+import requests
+import re
+import datetime
+import time
+from urllib.parse import quote_plus
+
+from requests.auth import HTTPDigestAuth
+from pygerrit2 import GerritRestAPI, HTTPBasicAuth
+
+# GET /changes/?q=status:abandoned&q=before:{date}&q={TERM}
+# GET https://gerrit.wikimedia.org/r/changes/?q=status:abandoned+visualeditor
+
+def query_changes(
+    query_terms,
+    limit = 100,
+    api_url_base = 'https://gerrit.wikimedia.org/',
+    sleep = 10
+):
+
+    time.sleep(sleep)
+    to_query = 1
+    after = None
+
+    data = []
+
+    while to_query == 1:
+        time.sleep(sleep)
+        joined_terms = "+".join(query_terms)
+        params = {
+            'q' : joined_terms,
+        }
+
+        api_url = f"{api_url_base}r/changes/?q={joined_terms}"
+        #add no-limit and API key somewhere
+
+        #auth = HTTPBasicAuth("ggonnemm", "1V6txZh5X+N3JpDMm5zqZM2M7ewA5D09g4ABOZAl5Q")
+        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
+
+        result = json.loads(response.text[5:])  # loads(), not load(): parse the string after Gerrit's ")]}'" XSSI prefix
+        #print(response.text)
+        ## the data
+        data_tmp = result
+        data += data_tmp
+        print(data[-1])
+        ## check if there are more results to query
+        break
+    return data
+
+
+if __name__ == "__main__":
+    query_strings = ['before:2016-12-31', 'visualeditor']
+    results = query_changes(query_strings)
+    print(results)
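As committed, query_changes fetches only the first page: the loop breaks before the "check if there are more results" step is implemented. A minimal pagination sketch, assuming Gerrit's documented behavior of setting "_more_changes": true on the last entry of a truncated page and accepting an S start-offset parameter; fetch_all_changes is a hypothetical name, not part of this commit:

import json
import time
import requests

def fetch_all_changes(query, api_url_base="https://gerrit.wikimedia.org/", page_size=100, sleep=10):
    data = []
    start = 0
    while True:
        time.sleep(sleep)  # stay polite to the public endpoint
        api_url = f"{api_url_base}r/changes/?q={query}&n={page_size}&S={start}"
        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
        # Gerrit prefixes JSON responses with ")]}'" to block XSSI; skip it
        batch = json.loads(response.text[5:])
        data += batch
        if batch and batch[-1].get("_more_changes"):
            start += len(batch)  # resume after the changes already fetched
        else:
            return data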
src/lib/gerrit_query.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
+QUERY_STRING="before:2013-03-29+visualeditor"
+
+API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"
+
+curl -X GET "$API_URL" -H "Accept: application/json" -o response.json
@@ -36,12 +36,17 @@ def query_task_tag(

     data = []

+    # for bot frameworks
+    # listed on the help page as of 2-12-2024
+    # utilizing git as their VCS
+
     while to_query == 1:
         time.sleep(sleep)
         params = {
             'api.token' : api_token,
             'constraints[query]':[tag_term], ## term that task is searched for with
-            #'constraints[projects]':["VisualEditor"], ## term that task is tagged with
+            # seemed to be artificially limiting the data that was returned, unrealistically low count values
+            #'constraints[projects]':[tag_term], ## term that task is tagged with
             'constraints[createdStart]':ts1, ## timestamp task creation (min)
             'constraints[createdEnd]':ts2, ## timestamp task creation (max)
             'limit':limit,
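For orientation, these constraints feed Phabricator's Conduit method maniphest.search, which pages with an after cursor rather than an offset. A hedged sketch of fetching one page under the same constraint scheme; fetch_task_page is a hypothetical name, not part of this commit:

import requests

def fetch_task_page(api_base, api_token, tag_term, ts1, ts2, limit=100, after=None):
    params = {
        'api.token': api_token,
        'constraints[query]': tag_term,    # full-text search term, as above
        'constraints[createdStart]': ts1,  # creation window start, epoch seconds
        'constraints[createdEnd]': ts2,    # creation window end, epoch seconds
        'limit': limit,
    }
    if after is not None:
        params['after'] = after  # cursor returned by the previous page
    result = requests.post(api_base + 'maniphest.search', data=params).json()['result']
    # result['cursor']['after'] is None once the final page has been returned
    return result['data'], result['cursor']['after']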
@@ -127,10 +132,11 @@ if __name__ == "__main__":
    token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
    api_base = 'https://phabricator.wikimedia.org/api/'

-   p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2024, 6, 10, 0, 0, 0)))
-   p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 10, 10, 0, 0, 0)))
+   p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
+   p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
+   for tag in tags:

-   p_data = query_task_tag("Parsoid", ts1=p_ts1, ts2=p_ts2)
+       p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)

    for entry in p_data:
        task_id = entry['phid']
@@ -139,7 +145,7 @@ if __name__ == "__main__":
        comments = {}
        for item in transactions:
            comments[item['id']] = item['comments']
        entry['task_comments'] = comments
-   DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
-   with open(DATA_PREFIX + "parsoid/" + "2024_6_10_to_2024_10_10.json", "w") as outfile1:
+   DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
+   with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
        json.dump(p_data, outfile1)
@@ -24,7 +24,7 @@ def toArray(str):

 if __name__ == "__main__":

-    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
+    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/"

     # Note: string unescaping and array conversion is done later
     mediawiki_history_schema = StructType([
@@ -200,4 +200,4 @@ if __name__ == "__main__":
         sort(F.desc("day")). \
         show(10, False)

-    activity_count_df.write.format("csv").save("012825_nonbot_single.csv")
+    activity_count_df.write.format("csv").save("020925_nonbot_yearly.csv")
src/lib/ve_response.json (new file, 164630 lines)
File diff suppressed because it is too large