monitor pages from dsaez's wikidata crawler

Nathan TeBlunthuis 2020-04-04 15:23:33 -07:00
parent cfe21254d9
commit eae5464fd2
2 changed files with 135 additions and 188 deletions

wikipedia/scripts/fetch_daily_views.py (181 changed lines; Executable file → Normal file)

@@ -1,110 +1,85 @@
 #!/usr/bin/env python3
-
-###############################################################################
-#
-# This script assumes the presence of the COVID-19 repo.
-#
-# It (1) reads in the article list and then (2) calls the Wikimedia API to
-# fetch view information for each article. Output is to (3) JSON and TSV.
-#
-###############################################################################
-
-import sys
-import requests
-import argparse
-import json
-import time
-import os.path
-import datetime
-import logging
-from csv import DictWriter
-import digobs
-#import feather #TBD
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
-    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
-    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
-    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
-    args = parser.parse_args()
-    return(args)
-
-def main():
-    args = parse_args()
-
-    outputPath = args.output_folder
-    articleFile = args.article_file
-
-    #handle -d
-    if args.query_date:
-        query_date = args.query_date
-    else:
-        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        query_date = yesterday.strftime("%Y%m%d")
-
-    #handle -W
-    if args.logging_destination:
-        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
-    else:
-        logging.basicConfig(level=args.logging_level)
-
-    export_time = str(datetime.datetime.now())
-    export_date = datetime.datetime.today().strftime("%Y%m%d")
-
-    logging.info(f"Starting run at {export_time}")
-    logging.info(f"Last commit: {digobs.git_hash()}")
-
-    #1 Load up the list of article names
-    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.json")
-    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.tsv")
-
-    with open(articleFile, 'r') as infile:
-        articleList = list(map(str.strip, infile))
-
-    success = 0 #for logging how many work/fail
-    failure = 0
-
-    #3 Save results as a JSON and TSV
-    with open(j_outfilename, 'w') as j_outfile, \
-         open(t_outfilename, 'w') as t_outfile:
-
-        #2 Repeatedly call the API with that list of names
-        for a in articleList:
-            url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
-
-            response = requests.get(url)
-            if response.ok:
-                jd = response.json()["items"][0]
-                success = success + 1
-            else:
-                failure = failure + 1
-                logging.warning(f"Failure: {response.status_code} from {url}")
-                continue
-
-            # start writing the CSV File if it doesn't exist yet
-            try:
-                dw
-            except NameError:
-                dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
-                dw.writeheader()
-
-            logging.debug(f"printing data: {jd}")
-
-            # write out the line of the json file
-            print(json.dumps(jd), file=j_outfile)
-
-            # write out of the csv file
-            dw.writerow(jd)
-
-    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
-    # read the json back in and make a feather file?
-
-    logging.debug(f"Run complete at {datetime.datetime.now()}")
-    logging.info(f"Processed {success} successful URLs and {failure} failures.")
-
-if __name__ == "__main__":
-    main()
+
+import argparse
+import sqlite3
+import requests
+from datetime import datetime, timedelta
+import logging
+import digobs
+from os import path, mkdir
+from functools import partial
+from itertools import chain
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
+    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
+    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
+    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))
+    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
+    parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
+    args = parser.parse_args()
+
+    conn = args.input_db
+    conn.row_factory = sqlite3.Row
+
+    #handle -d
+    if args.query_date:
+        query_date = args.query_date.strftime("%Y%m%d")
+    else:
+        yesterday = datetime.today() - timedelta(days=1)
+        query_date = yesterday.strftime("%Y%m%d")
+
+    export_date = datetime.today().strftime("%Y%m%d")
+
+    digobs.init_logging(args)
+
+    logging.info(f"Destructively outputting results to {args.output_folder}")
+
+    #1 Load up the list of article names
+    logging.info("loading info from database")
+    projects = [row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall()]
+
+    successes = 0
+    failures = 0
+
+    for project in projects:
+        project_folder = path.join(args.output_folder, project)
+        if not path.exists(project_folder):
+            mkdir(project_folder)
+
+        dump_folder = path.join(project_folder, export_date)
+        if not path.exists(dump_folder):
+            mkdir(dump_folder)
+
+        logging.info(f"Getting page views for {project}")
+
+        rows = conn.execute(f"SELECT DISTINCT page from pagesPerProjectTable WHERE project='{project}';").fetchall()
+        pages = (row['page'] for row in rows)
+
+        # special case for english, we have a wikiproject input file
+        if project == "en.wikipedia":
+            pages = chain(pages, map(str.strip, args.input_file))
+
+        call_view_api = partial(digobs.call_view_api, project=project, query_date=query_date)
+        responses = map(call_view_api, pages)
+
+        j_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")
+        t_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")
+
+        with open(j_outfilename, 'w') as j_outfile, \
+             open(t_outfilename, 'w') as t_outfile:
+            proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)
+
+        logging.info(f"Processed {proj_successes} successes and {proj_failures} failures for {project}")
+        successes = proj_successes + successes
+        failures = proj_failures + failures
+
+    conn.close()
+
+    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
+    # read the json back in and make a feather file?
+
+    logging.debug(f"Run complete at {datetime.now()}")
+    logging.info(f"Processed {successes} successful URLs and {failures} failures.")

wikipedia/scripts/fetch_revisions.py (138 changed lines; Executable file → Normal file)

@@ -11,48 +11,34 @@
 import argparse
 import logging
-import os.path
-import json
-import datetime
-
-from requests import Request
-from csv import DictWriter
-from mw import api
-import digobs
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
-    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
-    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
-    args = parser.parse_args()
-    return(args)
-
-def main():
-    args = parse_args()
-
-    output_path = args.output_folder
-    article_filename = args.article_file
-
-    #handle -W
-    if args.logging_destination:
-        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
-    else:
-        logging.basicConfig(level=args.logging_level)
-
-    export_time = str(datetime.datetime.now())
-    export_date = datetime.datetime.today().strftime("%Y%m%d")
-
-    logging.info(f"Starting run at {export_time}")
-    logging.info(f"Last commit: {digobs.git_hash()}")
-
-    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
-    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
-
-    api_session = api.Session("https://en.wikipedia.org/w/api.php")
+from os import path, mkdir
+import json
+import datetime
+import sqlite3
+
+from functools import partial
+from itertools import chain
+from csv import DictWriter
+import digobs
+
+def main():
+    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
+    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
+    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
+    parser.add_argument('-d', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
+    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
+    args = parser.parse_args()
+
+    logging = digobs.init_logging(args)
+
+    conn = args.input_db
+    conn.row_factory = sqlite3.Row
+    projects = (row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall())
+
+    tsv_fields = ['title', 'pageid', 'namespace']
+
     # list of properties from the API we want to gather (basically all of
     # them supported by mediawik-utilities)
@@ -69,80 +55,66 @@ def main():
                 'comment' : 'comment',
                 'content' : 'content' }
-
-    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']
-
-    # load the list of articles
-    with open(article_filename, 'r') as infile:
-        article_list = list(map(str.strip, infile))
-
-    def get_revisions_for_page(title):
-        return api_session.revisions.query(properties=rv_props.values(),
-                                           titles={title},
-                                           direction="newer")
-
-    tsv_fields = ['title', 'pageid', 'namespace']
-    tsv_fields = tsv_fields + list(rv_props.keys())
-
-    # drop fields that we identified for exclusion
-    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]
-
-    # add special export fields
-    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
-
-    export_info = { 'git_commit' : digobs.git_hash(),
-                    'timestamp' : export_time }
-
-    with open(json_output_filename, 'w') as json_output, \
-         open(tsv_output_filename, 'w') as tsv_output:
-        tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
-        tsv_writer.writeheader()
-
-        for article in article_list:
-            logging.info(f"pulling revisions for: {article}")
-            for rev in get_revisions_for_page(article):
-                logging.debug(f"processing raw revision: {rev}")
-
-                # add export metadata
-                rev['exported'] = export_info
-
-                # save the json version of the code
-                print(json.dumps(rev), file=json_output)
-
-                # handle missing data
-                if "sha1" not in rev:
-                    rev["sha1"] = ""
-
-                if "userhidden" in rev:
-                    rev["user"] = ""
-                    rev["userid"] = ""
-
-                # recode anon so it's true or false instead of present/missing
-                if "anon" in rev:
-                    rev["anon"] = True
-                else:
-                    rev["anon"] = False
-
-                # let's recode "minor" in the same way
-                if "minor" in rev:
-                    rev["minor"] = True
-                else:
-                    rev["minor"] = False
-
-                # add page title information
-                rev['title'] = rev['page']['title']
-                rev['pageid'] = rev['page']['pageid']
-                rev['namespace'] = rev['page']['ns']
-
-                # construct a URL
-                rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
-                                     params={'title' : rev['title'].replace(" ", "_"),
-                                             'oldid' : rev['revid']}).prepare().url
-
-                rev['export_timestamp'] = export_time
-                rev['export_commit'] = digobs.git_hash(short=True)
-
-                tsv_writer.writerow({k: rev[k] for k in tsv_fields})
+
+    def get_project_pages(project):
+        return (row['page'] for row in conn.execute(f"SELECT DISTINCT page FROM pagesPerProjectTable WHERE project == '{project}';").fetchall())
+
+    def get_project_revisions(project):
+        pages = get_project_pages(project)
+
+        if project == "en.wikipedia":
+            pages = chain(pages, map(str.strip, args.input_file))
+
+        return digobs.get_pages_revisions(pages, project=project, logging=logging, rv_props=rv_props)
+
+    tsv_fields = tsv_fields + list(rv_props.keys())
+
+    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']
+
+    # drop fields that we identified for exclusion
+    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]
+
+    # add special export fields
+    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
+
+    export_time = str(datetime.datetime.now())
+
+    rev_batch_to_tsv = partial(digobs.rev_batch_to_tsv,
+                               tsv_fields=tsv_fields,
+                               export_info={'export_timestamp': export_time,
+                                            'export_commit': digobs.git_hash(short=True)})
+
+    export_info = { 'git_commit' : digobs.git_hash(),
+                    'timestamp' : export_time }
+
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
+
+    rev_batch_to_json = partial(digobs.rev_batch_to_json,
+                                export_info=export_info)
+
+    def write_project_pages(project):
+        project_folder = path.join(args.output_folder, project)
+        if not path.exists(project_folder):
+            mkdir(project_folder)
+
+        dump_folder = path.join(project_folder, export_date)
+        if not path.exists(dump_folder):
+            mkdir(dump_folder)
+
+        project_revs = get_project_revisions(project)
+
+        json_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.json")
+        tsv_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.tsv")
+
+        with open(json_output_filename, 'w') as json_output, \
+             open(tsv_output_filename, 'w') as tsv_output:
+            tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
+            tsv_writer.writeheader()
+
+            for rev_batch in project_revs:
+                logging.debug(f"processing raw revision: {rev_batch}")
+                rev_batch_to_json(rev_batch, json_output=json_output)
+                rev_batch_to_tsv(rev_batch, project=project, tsv_writer=tsv_writer)
+
+    for project in projects:
+        write_project_pages(project)
 
 if __name__ == "__main__":
     main()
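
The per-revision recoding that used to live inline in main() now happens inside digobs.rev_batch_to_json and digobs.rev_batch_to_tsv, which are not shown in this diff. A rough sketch of what those helpers might look like, reusing the removed inline logic; the function names come from the diff, but the exact signatures and the per-project URL construction are assumptions:

# Hypothetical sketch of the rev_batch_to_* helpers assumed above; the real
# implementations in digobs.py may differ.
import json
from requests import Request

def rev_batch_to_json(rev_batch, export_info, json_output):
    # Tag each revision with export metadata and write it as one JSON line.
    for rev in rev_batch:
        rev['exported'] = export_info
        print(json.dumps(rev), file=json_output)

def rev_batch_to_tsv(rev_batch, tsv_fields, export_info, project, tsv_writer):
    for rev in rev_batch:
        # handle missing data
        if "sha1" not in rev:
            rev["sha1"] = ""
        if "userhidden" in rev:
            rev["user"] = ""
            rev["userid"] = ""
        # recode anon and minor as booleans instead of present/missing
        rev["anon"] = "anon" in rev
        rev["minor"] = "minor" in rev
        # add page title information
        rev['title'] = rev['page']['title']
        rev['pageid'] = rev['page']['pageid']
        rev['namespace'] = rev['page']['ns']
        # construct a permalink URL; assumes project names like "en.wikipedia"
        rev['url'] = Request('GET', f"https://{project}.org/w/index.php",
                             params={'title': rev['title'].replace(" ", "_"),
                                     'oldid': rev['revid']}).prepare().url
        rev['export_timestamp'] = export_info['export_timestamp']
        rev['export_commit'] = export_info['export_commit']
        tsv_writer.writerow({k: rev[k] for k in tsv_fields})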