changes in response to code review by nate

- moved some common functions into a shared module
- other smaller changes
Benjamin Mako Hill 2020-04-01 17:16:34 -05:00
parent 4fe5deb013
commit 070d23f718
3 changed files with 37 additions and 36 deletions


@@ -0,0 +1,27 @@
#!/usr/bin/env python3

import sys
import subprocess
import logging

def git_hash(short=False):
    if short:
        return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    else:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()

def get_loglevel(arg_loglevel):
    loglevel_mapping = { 'debug' : logging.DEBUG,
                         'info' : logging.INFO,
                         'warning' : logging.WARNING,
                         'error' : logging.ERROR,
                         'critical' : logging.CRITICAL }

    if arg_loglevel in loglevel_mapping:
        loglevel = loglevel_mapping[arg_loglevel]
        return loglevel
    else:
        print("Choose a valid log level: debug, info, warning, error, or critical", file=sys.stderr)
        return logging.INFO
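
Since the two scripts below now delegate to this module, a minimal usage sketch may help. It assumes the new file is saved as digobs.py on the import path (the module name comes from the import statements in the diffs below; the filename is an assumption) and that the script runs inside a git checkout:

# hypothetical usage sketch; not part of the commit
import logging
import digobs   # the new shared module above

# map a user-supplied string onto a logging level, falling back to INFO
loglevel = digobs.get_loglevel("debug")
logging.basicConfig(level=loglevel)

# record which commit of the repository produced this run
logging.info(f"Last commit: {digobs.git_hash()}")              # full hash
logging.info(f"Short commit: {digobs.git_hash(short=True)}")   # abbreviated hash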


@@ -10,16 +10,15 @@
 ###############################################################################
 import sys
-import subprocess
 import requests
 import argparse
 import json
 import time
 import os.path
-import argparse
 import datetime
 import logging
 from csv import DictWriter
+import digobs
 #import feather #TBD

 def parse_args():
@@ -47,17 +46,7 @@ def main():
     query_date = yesterday.strftime("%Y%m%d")

     #handle -L
-    loglevel_mapping = { 'debug' : logging.DEBUG,
-                         'info' : logging.INFO,
-                         'warning' : logging.WARNING,
-                         'error' : logging.ERROR,
-                         'critical' : logging.CRITICAL }
-
-    if args.logging_level in loglevel_mapping:
-        loglevel = loglevel_mapping[args.logging_level]
-    else:
-        print("Choose a valid log level: debug, info, warning, error, or critical")
-        exit
+    loglevel = digobs.get_loglevel(args.logging_level)

     #handle -W
     if args.logging_destination:
@@ -65,20 +54,18 @@ def main():
     else:
         logging.basicConfig(level=loglevel)

-    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
-    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
     export_date = datetime.datetime.today().strftime("%Y%m%d")

     logging.info(f"Starting run at {export_time}")
-    logging.info(f"Last commit: {export_git_hash}")
+    logging.info(f"Last commit: {digobs.git_hash()}")

     #1 Load up the list of article names
     j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
     t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")

     with open(articleFile, 'r') as infile:
-        articleList = list(infile)
+        articleList = list(map(str.strip, infile))

     success = 0 #for logging how many work/fail
     failure = 0
@@ -89,7 +76,6 @@ def main():
     #2 Repeatedly call the API with that list of names
     for a in articleList:
-        a = a.strip("\"\n") #destringify
         url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
         response = requests.get(url)
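
For context on the loop above, here is a hedged sketch of one request/response cycle against the Wikimedia pageviews REST endpoint. The example article title and date are made up, and the items/views response shape reflects the public API rather than anything this commit defines:

# hypothetical illustration of the API call in the loop above
import requests

a = "COVID-19_pandemic"   # example article title (assumption)
query_date = "20200331"   # YYYYMMDD, as built from yesterday's date in the script

url = (f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
       f"en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00")

response = requests.get(url)
if response.ok:
    # the REST API wraps results in an "items" list, one entry per day requested
    for item in response.json().get("items", []):
        print(item["article"], item["timestamp"], item["views"])
else:
    print(f"request failed with HTTP {response.status_code}")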


@@ -13,12 +13,12 @@ import argparse
 import logging
 import os.path
 import json
-import subprocess
 import datetime
 from requests import Request
 from csv import DictWriter
 from mw import api
+import digobs

 def parse_args():
@@ -38,17 +38,7 @@ def main():
     article_filename = args.article_file

     #handle -L
-    loglevel_mapping = { 'debug' : logging.DEBUG,
-                         'info' : logging.INFO,
-                         'warning' : logging.WARNING,
-                         'error' : logging.ERROR,
-                         'critical' : logging.CRITICAL }
-
-    if args.logging_level in loglevel_mapping:
-        loglevel = loglevel_mapping[args.logging_level]
-    else:
-        print("Choose a valid log level: debug, info, warning, error, or critical")
-        exit
+    loglevel = digobs.get_loglevel(args.logging_level)

     #handle -W
     if args.logging_destination:
@@ -56,13 +46,11 @@ def main():
     else:
         logging.basicConfig(level=loglevel)

-    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
-    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
     export_date = datetime.datetime.today().strftime("%Y%m%d")

     logging.info(f"Starting run at {export_time}")
-    logging.info(f"Last commit: {export_git_hash}")
+    logging.info(f"Last commit: {digobs.git_hash()}")

     json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
     tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
@@ -88,7 +76,7 @@ def main():
     # load the list of articles
     with open(article_filename, 'r') as infile:
-        article_list = [art.strip() for art in list(infile)]
+        article_list = list(map(str.strip, infile))

     def get_revisions_for_page(title):
         return api_session.revisions.query(properties=rv_props.values(),
@@ -104,7 +92,7 @@ def main():
     # add special export fields
     tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']

-    export_info = { 'git_commit' : export_git_hash,
+    export_info = { 'git_commit' : digobs.git_hash(),
                     'timestamp' : export_time }

     with open(json_output_filename, 'w') as json_output, \
@@ -155,7 +143,7 @@ def main():
                                 'oldid' : rev['revid']}).prepare().url
             rev['export_timestamp'] = export_time
-            rev['export_commit'] = export_git_short_hash
+            rev['export_commit'] = digobs.git_hash(short=True)

             tsv_writer.writerow({k: rev[k] for k in tsv_fields})
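
To make the provenance bookkeeping concrete, a small sketch of how one revision row picks up the export metadata; rev, tsv_fields, and the writer are simplified stand-ins for the objects in the loop above, and digobs is the module added by this commit:

# hypothetical, trimmed-down version of the export-metadata pattern above
import csv
import datetime
import sys
import digobs   # assumes the shared helpers module from this commit

export_time = str(datetime.datetime.now())

tsv_fields = ['revid', 'export_timestamp', 'export_commit']   # trimmed field list
tsv_writer = csv.DictWriter(sys.stdout, fieldnames=tsv_fields, delimiter='\t')
tsv_writer.writeheader()

rev = {'revid': 12345}   # stand-in for one revision returned by the API
rev['export_timestamp'] = export_time
rev['export_commit'] = digobs.git_hash(short=True)   # short hash ties each row to the code version
tsv_writer.writerow({k: rev[k] for k in tsv_fields})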