monitor pages from dsaez's wikidata crawler

2020-04-04 15:23:33 -07:00
parent cfe21254d9
commit eae5464fd2
2 changed files with 135 additions and 188 deletions

wikipedia/scripts/fetch_revisions.py (142 changed lines) Executable file → Normal file

@@ -11,48 +11,34 @@
 import argparse
 import logging
-import os.path
+from os import path, mkdir
 import json
 import datetime
-from requests import Request
+import sqlite3
 from functools import partial
+from itertools import chain
 from csv import DictWriter
-from mw import api
 import digobs
-def parse_args():
+def main():
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
-    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
+    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
+    parser.add_argument('-d', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type = sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
     args = parser.parse_args()
-    return(args)
-def main():
-    args = parse_args()
+    logging = digobs.init_logging(args)
-    output_path = args.output_folder
-    article_filename = args.article_file
+    conn = args.input_db
+    conn.row_factory = sqlite3.Row
-    #handle -W
-    if args.logging_destination:
-        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
-    else:
-        logging.basicConfig(level=args.logging_level)
+    projects = (row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall())
-    export_time = str(datetime.datetime.now())
-    export_date = datetime.datetime.today().strftime("%Y%m%d")
-    logging.info(f"Starting run at {export_time}")
-    logging.info(f"Last commit: {digobs.git_hash()}")
-    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
-    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
-    api_session = api.Session("https://en.wikipedia.org/w/api.php")
+    tsv_fields = ['title', 'pageid', 'namespace']
     # list of properties from the API we want to gather (basically all of
     # them supported by mediawik-utilities)
@@ -68,81 +54,67 @@ def main():
                 'flags' : 'flags',
                 'comment' : 'comment',
                 'content' : 'content' }
+    def get_project_pages(project):
+        return (row['page'] for row in conn.execute(f"SELECT DISTINCT page FROM pagesPerProjectTable WHERE project == '{project}';").fetchall())
+    def get_project_revisions(project):
+        pages = get_project_pages(project)
+        if project=="en.wikipedia":
+            pages = chain(pages, map(str.strip,args.input_file))
+        return digobs.get_pages_revisions(pages, project=project, logging=logging, rv_props=rv_props)
+    tsv_fields = tsv_fields + list(rv_props.keys())
     exclude_from_tsv = ['tags', 'comment', 'content', 'flags']
-    # load the list of articles
-    with open(article_filename, 'r') as infile:
-        article_list= list(map(str.strip, infile))
-    def get_revisions_for_page(title):
-        return api_session.revisions.query(properties=rv_props.values(),
-                                           titles={title},
-                                           direction="newer")
-    tsv_fields = ['title', 'pageid', 'namespace']
-    tsv_fields = tsv_fields + list(rv_props.keys())
     # drop fields that we identified for exclusion
     tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]
     # add special export fields
     tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
+    export_time = str(datetime.datetime.now())
+    rev_batch_to_tsv = partial(digobs.rev_batch_to_tsv,
+                               tsv_fields = tsv_fields,
+                               export_info={'export_timestamp':export_time,
+                                            'export_commit':digobs.git_hash(short=True)})
     export_info = { 'git_commit' : digobs.git_hash(),
                     'timestamp' : export_time }
-    with open(json_output_filename, 'w') as json_output, \
-         open(tsv_output_filename, 'w') as tsv_output:
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
-        tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
-        tsv_writer.writeheader()
+    rev_batch_to_json = partial(digobs.rev_batch_to_json,
+                                export_info = export_info)
-        for article in article_list:
-            logging.info(f"pulling revisions for: {article}")
-            for rev in get_revisions_for_page(article):
-                logging.debug(f"processing raw revision: {rev}")
+    def write_project_pages(project):
+        project_folder = path.join(args.output_folder, project)
+        if not path.exists(project_folder):
+            mkdir(project_folder)
-                # add export metadata
-                rev['exported'] = export_info
+        dump_folder = path.join(project_folder, export_date)
+        if not path.exists(dump_folder):
+            mkdir(dump_folder)
-                # save the json version of the code
-                print(json.dumps(rev), file=json_output)
+        project_revs = get_project_revisions(project)
+        json_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.json")
+        tsv_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.tsv")
-                # handle missing data
-                if "sha1" not in rev:
-                    rev["sha1"] = ""
+        with open(json_output_filename, 'w') as json_output, \
+             open(tsv_output_filename, 'w') as tsv_output:
+            tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
+            tsv_writer.writeheader()
+            for rev_batch in project_revs:
+                logging.debug(f"processing raw revision: {rev_batch}")
+                rev_batch_to_json(rev_batch, json_output=json_output)
+                rev_batch_to_tsv(rev_batch, project=project, tsv_writer=tsv_writer)
-                if "userhidden" in rev:
-                    rev["user"] = ""
-                    rev["userid"] = ""
-                # recode anon so it's true or false instead of present/missing
-                if "anon" in rev:
-                    rev["anon"] = True
-                else:
-                    rev["anon"] = False
-                # let's recode "minor" in the same way
-                if "minor" in rev:
-                    rev["minor"] = True
-                else:
-                    rev["minor"] = False
-                # add page title information
-                rev['title'] = rev['page']['title']
-                rev['pageid'] = rev['page']['pageid']
-                rev['namespace'] = rev['page']['ns']
-                # construct a URL
-                rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
-                                     params={'title' : rev['title'].replace(" ", "_"),
-                                             'oldid' : rev['revid']}).prepare().url
-                rev['export_timestamp'] = export_time
-                rev['export_commit'] = digobs.git_hash(short=True)
-                tsv_writer.writerow({k: rev[k] for k in tsv_fields})
+    for project in projects:
+        write_project_pages(project)
 if __name__ == "__main__":
     main()
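
Note (not part of the commit): the change replaces the flat article list with page lists read from the wikidata crawler's sqlite database. Below is a minimal sketch of that lookup, assuming real-time-wiki-covid-tracker/AllWikidataItems.sqlite and its pagesPerProjectTable (columns project and page) exist as referenced in the diff; the parameterized query is an illustrative variant of the f-string query in get_project_pages, not the committed code.

import sqlite3

# Database path and table/column names are taken from the diff above;
# the bound-parameter query is an assumed variant, not the committed code.
conn = sqlite3.connect("real-time-wiki-covid-tracker/AllWikidataItems.sqlite")
conn.row_factory = sqlite3.Row

def get_project_pages(conn, project):
    # Distinct page titles tracked for one wiki project.
    rows = conn.execute(
        "SELECT DISTINCT page FROM pagesPerProjectTable WHERE project = ?;",
        (project,))
    return (row["page"] for row in rows)

projects = [row["project"] for row in
            conn.execute("SELECT DISTINCT project FROM pagesPerProjectTable;")]

for project in projects:
    for page in get_project_pages(conn, project):
        print(project, page)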