#!/usr/bin/env python3
import argparse
import sqlite3
import requests
from datetime import datetime, timedelta
import logging
import digobs
from os import path, mkdir
from functools import partial
from itertools import chain

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get a list of pages related to COVID-19, the pandemic, and SARS-CoV-2 related entities.")
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
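
    # Example invocation (the script filename below is illustrative, not the actual name):
    #   python3 fetch_daily_views.py -o wikipedia/data -d 20200401 -L debug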

    args = parser.parse_args()
    conn = args.input_db
    conn.row_factory = sqlite3.Row

    # handle -d: use the supplied date, otherwise default to yesterday
    if args.query_date:
        query_date = args.query_date.strftime("%Y%m%d")
    else:
        yesterday = datetime.today() - timedelta(days=1)
        query_date = yesterday.strftime("%Y%m%d")

    digobs.init_logging(args)

    logging.info(f"Destructively outputting results to {args.output_folder}")

    # 1. Load the list of projects and article names from the database
    logging.info("loading info from database")
    projects = [row['project'] for row in conn.execute("SELECT DISTINCT project FROM pagesPerProjectTable;").fetchall()]
    successes = 0
    failures = 0

    for project in projects:
        # create a per-project folder under the output folder, then a per-date dump folder inside it
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)

        dump_folder = path.join(project_folder, query_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        logging.info(f"Getting page views for {project}")
        rows = conn.execute("SELECT DISTINCT page FROM pagesPerProjectTable WHERE project = ?;", (project,)).fetchall()
        pages = (row['page'] for row in rows)

        # special case for English: we also have a wikiproject input file
        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))

        call_view_api = partial(digobs.call_view_api, project=project, query_date=query_date)

        responses = map(call_view_api, pages)
        j_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")
        t_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")

        with open(j_outfilename, 'w') as j_outfile, \
             open(t_outfilename, 'w') as t_outfile:
            proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)
            logging.info(f"Processed {proj_successes} successes and {proj_failures} failures for {project}")
            successes = proj_successes + successes
            failures = proj_failures + failures

    conn.close()

    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
    # read the json back in and make a feather file?
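
    # A possible sketch for that feather step (not part of the original pipeline; it would
    # belong inside the per-project loop above rather than here). Assumes pandas and pyarrow
    # are installed and that the TSV written for each project has a header row:
    #   import pandas as pd
    #   views = pd.read_csv(t_outfilename, sep='\t')
    #   views.to_feather(path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.feather"))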

    logging.debug(f"Run complete at {datetime.now()}")
    logging.info(f"Processed {successes} successful URLs and {failures} failures.")