misc data collection updates

parent d2aca29ee2
commit 44cc0d0bb7
4723  src/helper_scripts/cleaning_json.ipynb  (new file)
File diff suppressed because it is too large.
24  src/helper_scripts/frameworks_for_collection.csv  (new file)
@@ -0,0 +1,24 @@
+url
+https://github.com/MW-Peachy/Peachy
+https://github.com/addwiki/mediawiki-api-base
+https://github.com/addwiki/mediawiki-api
+https://github.com/hamstar/Wikimate
+https://github.com/alexz-enwp/wikitools
+https://github.com/Riamse/ceterach
+https://github.com/eldur/jwbf
+https://github.com/CXuesong/WikiClientLibrary
+https://github.com/mer-c/wiki-java
+https://github.com/greencardamom/BotWikiAwk
+https://github.com/mwclient/mwclient
+https://github.com/earwig/mwparserfromhell
+https://github.com/barrust/mediawiki
+https://github.com/ankostis/MatWiki
+https://github.com/WPCleaner/wpcleaner
+https://github.com/fastily/jwiki
+https://github.com/wikimedia-gadgets/mock-mediawiki
+https://github.com/kanasimi/wikiapi
+https://github.com/FTB-Gamepedia/MediaWiki-Butt-Ruby
+https://github.com/wikimedia/mediawiki-ruby-api
+https://github.com/kenpratt/wikipedia-client
+https://github.com/jpatokal/mediawiki-gateway
+https://github.com/AccelerationNet/cl-mediawiki
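A hedged aside, not part of the commit: one way this list might feed collection downstream is to read its single url column and derive a per-repository clone target. The read path and the git-clone note below are illustrative assumptions, not code from this repo.

    import pandas as pd

    # read the framework list committed above; "url" is its only column
    frameworks = pd.read_csv("src/helper_scripts/frameworks_for_collection.csv")
    for url in frameworks["url"]:
        name = url.rstrip("/").split("/")[-1]  # last path segment as a repo name
        print(name, url)  # e.g. feed each pair into `git clone {url} {name}`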
@@ -8,32 +8,28 @@ file_location = "/data/users/mgaughan/mw-repo-lifecycles/commit_data/"
 repo_location = "/data/users/mgaughan/mw-repo-lifecycles/repo_artifacts/"

 cst = datetime.timezone(datetime.timedelta(hours=-6))
 '''

 repos = {
     "core" : {
         "url": "https://gerrit.wikimedia.org/r/mediawiki/core",
         "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
     "parsoid" : {
         "url": "https://gerrit.wikimedia.org/r/mediawiki/services/parsoid",
         "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     },
     "visualeditor": {
         "url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
         "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2013, 5, 1, 00, 00, 00, tzinfo=cst)
     }
 }
 }
 '''
 repos = {
-    "visualeditor": {
+    "parsoid" : {
         "url": "https://gerrit.wikimedia.org/r/pywikibot/core",
         "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
         "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
     }
 }
 }
 '''

 for repo in repos.keys():
     print(repos[repo])
     print(repos[repo]['url'])
     print(repos[repo]["from_date"])
     print(repos[repo]["to_date"])
     repo_info = repos[repo]
-    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=True, csv_loc_prefix=file_location)
+    repo_lifecycle(repo_info['url'], repo_location + repo + "/tmp", repo_info["from_date"], repo_info["to_date"], to_save=False, csv_loc_prefix=file_location)
58  src/lib/gerrit_get.py  (new file)
@@ -0,0 +1,58 @@
+#from phabricator import Phabricator
+import os, sys
+import json
+import numpy as np
+import pandas as pd
+import requests
+import re
+import datetime
+import time
+from urllib.parse import quote_plus
+
+from requests.auth import HTTPDigestAuth
+from pygerrit2 import GerritRestAPI, HTTPBasicAuth
+
+# GET /changes/?q=status:abandoned&q=before:{date}&q={TERM}
+# GET https://gerrit.wikimedia.org/r/changes/?q=status:abandoned+visualeditor
+
+def query_changes(
+    query_terms,
+    limit = 100,
+    api_url_base = 'https://gerrit.wikimedia.org/',
+    sleep = 10
+):
+
+    time.sleep(sleep)
+    to_query = 1
+    after = None
+
+    data = []
+
+    while to_query == 1:
+        time.sleep(sleep)
+        joined_terms = "+".join(query_terms)
+        params = {
+            'q' : joined_terms,
+        }
+
+        api_url = f"{api_url_base}r/changes/?q={joined_terms}"
+        #add no-limit and API key somewhere
+
+        #auth = HTTPBasicAuth("ggonnemm", "1V6txZh5X+N3JpDMm5zqZM2M7ewA5D09g4ABOZAl5Q")
+        response = requests.get(api_url, headers={'Content-Type': 'application/json'})
+
+        result = json.loads(response.text[5:])  # json.loads, not json.load: we parse a string, after stripping Gerrit's ")]}'" guard line
+        #print(response.text)
+        ## the data
+        data_tmp = result
+        data += data_tmp
+        print(data[-1])
+        ## check if there are more results to query
+        break
+    return data
+
+
+if __name__ == "__main__":
+    query_strings = ['before:2016-12-31', 'visualeditor']
+    results = query_changes(query_strings)
+    print(results)
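The while loop above issues one request and then breaks, so only the first page of results is ever collected. A minimal sketch of how the pagination could be completed, assuming the documented Gerrit REST API behavior: the n parameter caps page size, the S parameter skips already-fetched records, and the last record of a page carries "_more_changes": true while more remain. The function name and defaults below are illustrative, not part of the commit.

    import json
    import time
    import requests

    def query_all_changes(query, limit=100, api_url_base="https://gerrit.wikimedia.org/", sleep=10):
        data, start = [], 0
        while True:
            time.sleep(sleep)  # stay polite to the public endpoint
            api_url = f"{api_url_base}r/changes/?q={query}&n={limit}&S={start}"
            response = requests.get(api_url, headers={"Accept": "application/json"})
            page = json.loads(response.text[5:])  # drop the ")]}'" guard line
            data += page
            if not page or not page[-1].get("_more_changes", False):
                break
            start += len(page)  # resume after the records already fetched
        return data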
8  src/lib/gerrit_query.sh  (new file, executable)
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+API_URL_BASE="https://gerrit.wikimedia.org/r/changes"
+QUERY_STRING="before:2013-03-29+visualeditor"
+
+API_URL="${API_URL_BASE}/?q=${QUERY_STRING}"
+
+curl -X GET "$API_URL" -H "Accept: application/json" -o response.json
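Note that the response.json written by this script starts with Gerrit's )]}' anti-XSSI guard line, so it is not directly valid JSON. A small sketch of loading it in Python; the filename is taken from the script above, nothing else is assumed.

    import json

    with open("response.json") as f:
        f.readline()  # discard the ")]}'" guard line Gerrit prepends
        changes = json.load(f)

    print(len(changes), "changes retrieved")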
@@ -36,12 +36,17 @@ def query_task_tag(

     data = []

+    # for bot frameworks
+    # listed on the help page as of 2-12-2024
+    # utilizing git as their VCS
+
     while to_query == 1:
         time.sleep(sleep)
         params = {
             'api.token' : api_token,
             'constraints[query]':[tag_term], ## term that task is searched for with
-            #'constraints[projects]':["VisualEditor"], ## term that task is tagged with
+            # seemed to be artificially limiting the data that was returned, unrealistically low count values
+            #'constraints[projects]':[tag_term], ## term that task is tagged with
             'constraints[createdStart]':ts1, ## timestamp task creation (min)
             'constraints[createdEnd]':ts2, ## timestamp task creation (max)
             'limit':limit,
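For context on the params being built here: maniphest.search is a Conduit endpoint that pages its results through a cursor. A hedged sketch of a complete fetch loop follows, using the constraint names visible in the hunk above plus the documented result.cursor.after field; the function name and token handling are illustrative, not this script's actual code.

    import requests

    def fetch_tasks(api_token, tag_term, ts1, ts2, limit=100,
                    api_base="https://phabricator.wikimedia.org/api/"):
        tasks, after = [], None
        while True:
            params = {
                "api.token": api_token,
                "constraints[query]": tag_term,    # full-text search term
                "constraints[createdStart]": ts1,  # epoch seconds, lower bound
                "constraints[createdEnd]": ts2,    # epoch seconds, upper bound
                "limit": limit,
            }
            if after is not None:
                params["after"] = after            # cursor from the previous page
            resp = requests.post(api_base + "maniphest.search", data=params).json()
            tasks += resp["result"]["data"]
            after = resp["result"]["cursor"]["after"]
            if after is None:
                break
        return tasks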
@@ -127,19 +132,20 @@ if __name__ == "__main__":
     token = "api-wurg254ciq5uvfxlr4rszn5ynpy4"
     api_base = 'https://phabricator.wikimedia.org/api/'

-    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2024, 6, 10, 0, 0, 0)))
-    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 10, 10, 0, 0, 0)))
+    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2010, 1, 1, 0, 0, 0)))
+    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2024, 12, 31, 0, 0, 0)))
     for tag in tags:

-        p_data = query_task_tag("Parsoid", ts1=p_ts1, ts2=p_ts2)
+        p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)

-        for entry in p_data:
-            task_id = entry['phid']
-            print(task_id)
-            transactions = query_transactions_phid_task(task_id)
-            comments = {}
-            for item in transactions:
-                comments[item['id']] = item['comments']
-            entry['task_comments'] = comments
-        DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
-        with open(DATA_PREFIX + "parsoid/" + "2024_6_10_to_2024_10_10.json", "w") as outfile1:
-            json.dump(p_data, outfile1)
+        for entry in p_data:
+            task_id = entry['phid']
+            print(task_id)
+            transactions = query_transactions_phid_task(task_id)
+            comments = {}
+            for item in transactions:
+                comments[item['id']] = item['comments']
+            entry['task_comments'] = comments
+        DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/" + tag
+        with open(DATA_PREFIX + "/" + "2010_1_1_to_2024_12_31.json", "w") as outfile1:
+            json.dump(p_data, outfile1)
+
@@ -24,7 +24,7 @@ def toArray(str):

 if __name__ == "__main__":

-    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
+    mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/"

     # Note: string unescaping and array conversion is done later
     mediawiki_history_schema = StructType([
@@ -200,4 +200,4 @@ if __name__ == "__main__":
         sort(F.desc("day")). \
         show(10, False)

-    activity_count_df.write.format("csv").save("012825_nonbot_single.csv")
+    activity_count_df.write.format("csv").save("020925_nonbot_yearly.csv")
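One caveat worth noting about the save calls above: Spark's DataFrame csv writer treats the given path as an output directory of part-files, not a single file. A short sketch of reading such an output back, using the new path from this hunk; the session setup is illustrative.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # the .save() target above is a directory of part-files, so load it as one
    activity = spark.read.format("csv").load("020925_nonbot_yearly.csv")
    activity.show(10, False)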
164630  src/lib/ve_response.json  (new file)
File diff suppressed because it is too large.