#!/usr/bin/env python3
###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch revision information for each article. Output is to (3) JSON and TSV.
#
###############################################################################
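
# Example invocation (the flag values below are this script's argparse
# defaults; the script filename itself is illustrative):
#
#   python3 wikipedia_revisions.py --output_folder wikipedia/data \
#       --input_file ./wikipedia/resources/enwp_wikiproject_covid19_articles.txt \
#       --input_db real-time-wiki-covid-tracker/AllWikidataItems.sqlite
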
import argparse
import logging
from os import path, mkdir
import json
import datetime
import sqlite3
from functools import partial
from itertools import chain
from csv import DictWriter
import digobs


def main():
    parser = argparse.ArgumentParser(description='Call the Wikimedia API to collect Wikipedia revision data.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-d', '--input_db', help="Input a path to a sqlite3 database from the real-time-wiki-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))

    args = parser.parse_args()
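    # NB: `logging` is rebound here from the stdlib module imported above to
    # the object returned by digobs.init_logging; all logging calls below go
    # through it.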
    logging = digobs.init_logging(args)

    conn = args.input_db
    conn.row_factory = sqlite3.Row
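    # the tracker database's pagesPerProjectTable maps each wiki project
    # (e.g. "en.wikipedia") to the pages it follows; iterate over the
    # distinct projects it contains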
    projects = (row['project'] for row in conn.execute("SELECT DISTINCT project FROM pagesPerProjectTable;").fetchall())

    tsv_fields = ['title', 'pageid', 'namespace']
    # list of properties from the API we want to gather (basically all of
    # them supported by mediawiki-utilities)
    rv_props = {'revid': 'ids',
                'timestamp': 'timestamp',
                'user': 'user',
                'userid': 'userid',
                'size': 'size',
                'sha1': 'sha1',
                'contentmodel': 'contentmodel',
                'tags': 'tags',
                'flags': 'flags',
                'comment': 'comment',
                'content': 'content'}

    def get_project_pages(project):
        return (row['page'] for row in conn.execute(f"SELECT DISTINCT page FROM pagesPerProjectTable WHERE project == '{project}';").fetchall())
2020-04-01 14:42:38 +00:00
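    # for English Wikipedia, the pages tracked in the database are combined
    # with the WikiProject COVID-19 article list passed via --input_file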
    def get_project_revisions(project):
        pages = get_project_pages(project)

        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))

        return digobs.get_pages_revisions(pages, project=project, logging=logging, rv_props=rv_props)

    tsv_fields = tsv_fields + list(rv_props.keys())

    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
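    # every export is stamped with the export time and the current git commit
    # (via digobs.git_hash) so output files can be traced back to the code
    # and moment that produced them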
    export_time = str(datetime.datetime.now())
    rev_batch_to_tsv = partial(digobs.rev_batch_to_tsv,
                               tsv_fields=tsv_fields,
                               export_info={'export_timestamp': export_time,
                                            'export_commit': digobs.git_hash(short=True)})

    export_info = {'git_commit': digobs.git_hash(),
                   'timestamp': export_time}

    export_date = datetime.datetime.today().strftime("%Y%m%d")

    rev_batch_to_json = partial(digobs.rev_batch_to_json,
                                export_info=export_info)
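    # write one JSON and one TSV revision dump per project under
    # <output_folder>/<project>/<export_date>/, creating the directories
    # as needed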
    def write_project_pages(project):
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)

        dump_folder = path.join(project_folder, export_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        project_revs = get_project_revisions(project)

        json_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.json")
        tsv_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.tsv")

        with open(json_output_filename, 'w') as json_output, \
             open(tsv_output_filename, 'w') as tsv_output:

            tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
            tsv_writer.writeheader()

            for rev_batch in project_revs:
                logging.debug(f"processing raw revision: {rev_batch}")
                rev_batch_to_json(rev_batch, json_output=json_output)
                rev_batch_to_tsv(rev_batch, project=project, tsv_writer=tsv_writer)

    for project in projects:
        write_project_pages(project)

if __name__ == "__main__":
    main()