address confusion with date

The timestamps in files should be the day that the exports are run. For
the view data, the query date needs to be the day before, but that
earlier date shouldn't be the timestamp we use in the files, etc.
Benjamin Mako Hill 2020-04-01 15:14:05 -05:00
parent 06d2fd1563
commit 04e00f363b
2 changed files with 11 additions and 17 deletions
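
For clarity, a minimal sketch of the convention this commit adopts (variable names follow the diff below; the query targets the previous day, presumably because a full day of view counts only exists once that day has ended):

    import datetime

    # Stamp for output file names: the day the export runs.
    export_date = datetime.datetime.today().strftime("%Y%m%d")

    # Date for the pageviews query: the previous, completed day.
    # This parameterizes the API call only, never the file names.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    query_date = yesterday.strftime("%Y%m%d")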

File 1 of 2 (Wikipedia daily views script)

@@ -23,7 +23,6 @@ from csv import DictWriter
 #import feather #TBD
 def parse_args():
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
@@ -42,10 +41,10 @@ def main():
     #handle -d
     if args.query_date:
-        queryDate = args.query_date
+        query_date = args.query_date
     else:
         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        queryDate = yesterday.strftime("%Y%m%d")
+        query_date = yesterday.strftime("%Y%m%d")
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -69,13 +68,14 @@ def main():
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
     #1 Load up the list of article names
-    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
-    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
+    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
+    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
     with open(articleFile, 'r') as infile:
         articleList = list(infile)
@@ -90,7 +90,7 @@ def main():
     #2 Repeatedly call the API with that list of names
     for a in articleList:
         a = a.strip("\"\n") #destringify
-        url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}00/{queryDate}00"
+        url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
         response = requests.get(url)
         if response.ok:
@@ -115,7 +115,7 @@ def main():
         # write out of the csv file
         dw.writerow(jd)
-    # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
+    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
     # read the json back in and make a feather file?
     logging.debug(f"Run complete at {datetime.datetime.now()}")
     logging.info(f"Processed {success} successful URLs and {failure} failures.")

File 2 of 2 (Wikipedia revisions script)

@@ -1,4 +1,4 @@
-#!yusr/bin/env python3
+#!/usr/bin/env python3
 ###############################################################################
 #
@@ -26,7 +26,6 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
-    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
     args = parser.parse_args()
@@ -37,12 +36,6 @@ def main():
     output_path = args.output_folder
     article_filename = args.article_file
-    #handle -d
-    if args.query_date:
-        query_date = args.query_date
-    else:
-        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        query_date = yesterday.strftime("%Y%m%d")
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -66,12 +59,13 @@ def main():
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
-    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
-    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
+    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
+    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
     api_session = api.Session("https://en.wikipedia.org/w/api.php")
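
The net effect across both files: each script now stamps its outputs with the day the export runs (export_date), while the views script's query_date only selects which day of data to request. A small sketch of the resulting file names, assuming for illustration a run on 2020-04-01:

    import os

    output_path = "wikipedia/data"
    export_date = "20200401"  # assumed run day for illustration

    # Both exports now share the same run-day stamp:
    #   wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-20200401.json
    #   wikipedia/data/digobs_covid19-wikipedia-enwiki_revisions-20200401.json
    views_out = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
    revs_out = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")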