covid19/wikipedia/scripts/fetch_daily_views.py

#!/usr/bin/env python3
import argparse
import sqlite3
import requests
from datetime import datetime, timedelta
import logging
import digobs
from os import path, mkdir
from functools import partial
from itertools import chain

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetch daily page view counts for pages related to COVID-19, the pandemic, and the SARS-CoV-2 virus.")
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
    args = parser.parse_args()
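
    # -b already opened the sqlite connection via argparse's type=sqlite3.connect;
    # a Row factory lets the queries below access columns by name.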
    conn = args.input_db
    conn.row_factory = sqlite3.Row

    # handle -d: use the supplied date, otherwise default to yesterday
    if args.query_date:
        query_date = args.query_date.strftime("%Y%m%d")
    else:
        yesterday = datetime.today() - timedelta(days=1)
        query_date = yesterday.strftime("%Y%m%d")

    digobs.init_logging(args)

    logging.info(f"Destructively outputting results to {args.output_folder}")
    # 1. Load up the list of article names
    logging.info("loading info from database")
    projects = [row['project'] for row in conn.execute("SELECT DISTINCT project FROM pagesPerProjectTable;").fetchall()]

    successes = 0
    failures = 0
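
    # For each wiki project tracked in the database: create an output directory,
    # look up its tracked pages, and fetch view counts for the query date.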
    for project in projects:
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)
        dump_folder = path.join(project_folder, query_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        logging.info(f"Getting page views for {project}")

        rows = conn.execute("SELECT DISTINCT page FROM pagesPerProjectTable WHERE project = ?;", (project,)).fetchall()
        pages = (row['page'] for row in rows)

        # special case for English: we also have a wikiproject input file
        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))
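
        # Bind the project and query date so each page title maps to a single API call.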
        call_view_api = partial(digobs.call_view_api, project=project, query_date=query_date)
        responses = map(call_view_api, pages)

        j_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")
        t_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")
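
        # Write the raw API responses as JSON and a flattened TSV side by side.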
        with open(j_outfilename, 'w') as j_outfile, \
             open(t_outfilename, 'w') as t_outfile:
            proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)

        logging.info(f"Processed {proj_successes} successes and {proj_failures} failures for {project}")

        successes = proj_successes + successes
        failures = proj_failures + failures
    conn.close()

    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
    # read the json back in and make a feather file?

    logging.debug(f"Run complete at {datetime.now()}")
    logging.info(f"Processed {successes} successful URLs and {failures} failures.")