Initial commit
# new file: runwikiq.sh
This commit is contained in:
158
userroles_scraper_scripts/userroles_from_logevents.py
Executable file
158
userroles_scraper_scripts/userroles_from_logevents.py
Executable file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (C) 2018 Nathan TeBlunthuis
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import time, re, os
|
||||
import sys
|
||||
import requests
|
||||
from mw import api
|
||||
from pprint import pprint
|
||||
from json.decoder import JSONDecodeError
|
||||
from itertools import islice
|
||||
|
||||
def write_logevents(logevents,out):
|
||||
for logevent in logevents:
|
||||
# if there is hidden information, we skip this one because there
|
||||
# is nothing to report
|
||||
if 'userhidden' in logevent or 'actionhidden' in logevent or 'commenthidden' in logevent:
|
||||
continue
|
||||
|
||||
le_output = [logevent['comment'],
|
||||
str(logevent['logid']),
|
||||
str(logevent['ns']),
|
||||
str(logevent['pageid']),
|
||||
logevent['timestamp'],
|
||||
logevent['title'],
|
||||
logevent['type'],
|
||||
str(logevent['user'])]
|
||||
|
||||
if "params" in logevent:
|
||||
params = logevent["params"]
|
||||
else:
|
||||
params = {}
|
||||
|
||||
if "rights" in logevent:
|
||||
le_output.extend(['false',
|
||||
logevent['rights']['new'],
|
||||
logevent['rights']['old']])
|
||||
|
||||
|
||||
elif "newgroups" in params and "oldgroups" in params:
|
||||
le_output.extend(['false',
|
||||
','.join(params['newgroups']),
|
||||
','.join(params['oldgroups'])])
|
||||
else:
|
||||
le_output.extend(['true', '', ''])
|
||||
|
||||
out.write("\t".join(le_output) + "\n")
|
||||
out.flush()
|
||||
# output data
|
||||
|
||||
def get_events_for_wiki(wikiname, url, wikitype="wikia"):
    """Download all 'rights' log events for one wiki and write them as TSV.

    Writes to logevents-2017/<wikiname>.tsv.  The file (with header) is
    created before the first request on purpose: the driver loop skips any
    wiki whose output file already exists, so even a failed attempt is not
    retried on the next run.

    wikitype is "wikia" (API at <url>/api.php, with redirect handling for
    dead/renamed wikis) or "wikipedia" (API at <url>/w/api.php).
    """
    if url[-1] != '/':
        url = url + '/'

    out = open("logevents-2017/%s.tsv" % wikiname, "w")
    out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
                         'title', 'type', 'user', 'ancient', 'rights-new',
                         'rights-old']) + "\n")
    out.flush()

    try:
        if wikitype == "wikia":
            api_url = url + 'api.php'
        else:  # wikitype == "wikipedia"
            api_url = url + "w/api.php"

        query = {'action': 'query',
                 'list': 'logevents',
                 'letype': 'rights',
                 'lelimit': '500',
                 'format': 'json',
                 'ledir': 'newer'}

        response = requests.get(api_url, params=query)
        hit_url = response.url

        if wikitype == "wikia":
            # Dead Wikia wikis redirect to the central hub
            # (e.g. http://community.wikia.com/wiki/...Not_a_valid_Wikia).
            if re.match(r"^http://(community|www)\.wikia\.com/", hit_url):
                print("ERROR: %s no longer exists" % wikiname)
                return
            # If we did not land on an api.php URL, the wiki was renamed;
            # follow the redirect to the canonical host and retry.
            if not re.match(r"^(http|https)://.*\.wikia.com/api\.php", hit_url):
                new_url = re.findall(r"^((http|https)://.*\.wikia\.com)",
                                     hit_url)[0][0]
                return get_events_for_wiki(wikiname, new_url,
                                           wikitype=wikitype)

        try:
            rv = response.json()
        except JSONDecodeError:
            # BUG FIX: the original only printed here and fell through with
            # `rv` unbound, raising NameError on the next line.
            print(" New Error! ")
            return

        try:
            write_logevents(rv['query']['logevents'], out)
        except KeyError as e:
            print("ERROR: %s contains no logevent data" % wikiname)
            print(e)
            return

        # Page through the remaining results; MediaWiki exposes either the
        # legacy 'query-continue' or the modern 'continue' mechanism.
        while 'query-continue' in rv or 'continue' in rv:
            if 'query-continue' in rv:
                query['lestart'] = rv['query-continue']['logevents']['lestart']
            else:
                query['continue'] = str(rv['continue'])
                query['lecontinue'] = str(rv['continue']['lecontinue'])

            response = requests.get(api_url, params=query)
            rv = response.json()
            write_logevents(rv['query']['logevents'], out)
    finally:
        # BUG FIX: the original leaked the file handle on every
        # early-return path; only the full-success path closed it.
        out.close()
|
||||
|
||||
# Wikis already scraped get a .tsv file (possibly header-only, from a
# previous failed attempt) and are skipped below, so the script can resume.
files = [re.sub(r'\.tsv$', '', i) for i in os.listdir("logevents-2017")]

# The input csv carries a header row; start iterating at line 1 to skip it.
header = True
i = 1 if header else 0

# Iterate through the list of wikis, one "name,url" line per row.
# BUG FIX: the input file is now closed via `with` (original leaked it).
with open("../wikis.needing.userroles.csv", "r") as wiki_list:
    for line in islice(wiki_list, i, None):

        # maxsplit=1 so a stray comma inside the URL cannot break unpacking
        (wiki, url) = line.split(",", 1)
        url = url.strip()
        print("Processing wiki: %s" % wiki)

        if wiki in files:
            print("SKIPPING: file \"%s\" already exists" % wiki)
            continue

        if "wikia.com" in url:
            wikitype = "wikia"
        else:  # assume anything non-wikia is a wikipedia-style install
            wikitype = "wikipedia"

        get_events_for_wiki(wiki, url, wikitype=wikitype)
        # Be polite to the API servers between wikis.
        time.sleep(1)
|
||||
Reference in New Issue
Block a user