Address confusion between the query date and the export date
The timestamps in output files should be the day the exports are done. For the view data, the query date needs to be the day before, but that earlier query date should not be the timestamp used in file names and other export metadata.
This commit is contained in:
parent
06d2fd1563
commit
04e00f363b
@ -23,7 +23,6 @@ from csv import DictWriter
|
||||
#import feather #TBD
|
||||
|
||||
def parse_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
|
||||
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
|
||||
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
|
||||
@ -42,10 +41,10 @@ def main():
|
||||
|
||||
#handle -d
|
||||
if args.query_date:
|
||||
queryDate = args.query_date
|
||||
query_date = args.query_date
|
||||
else:
|
||||
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
queryDate = yesterday.strftime("%Y%m%d")
|
||||
query_date = yesterday.strftime("%Y%m%d")
|
||||
|
||||
#handle -L
|
||||
loglevel_mapping = { 'debug' : logging.DEBUG,
|
||||
@ -69,13 +68,14 @@ def main():
|
||||
export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
|
||||
export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
|
||||
export_time = str(datetime.datetime.now())
|
||||
export_date = datetime.datetime.today().strftime("%Y%m%d")
|
||||
|
||||
logging.info(f"Starting run at {export_time}")
|
||||
logging.info(f"Last commit: {export_git_hash}")
|
||||
|
||||
#1 Load up the list of article names
|
||||
j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
|
||||
t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
|
||||
j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
|
||||
t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
|
||||
|
||||
with open(articleFile, 'r') as infile:
|
||||
articleList = list(infile)
|
||||
@ -90,7 +90,7 @@ def main():
|
||||
#2 Repeatedly call the API with that list of names
|
||||
for a in articleList:
|
||||
a = a.strip("\"\n") #destringify
|
||||
url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}00/{queryDate}00"
|
||||
url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
|
||||
|
||||
response = requests.get(url)
|
||||
if response.ok:
|
||||
@ -115,7 +115,7 @@ def main():
|
||||
# write out of the csv file
|
||||
dw.writerow(jd)
|
||||
|
||||
# f_Out = outputPath + "dailyviews" + queryDate + ".feather"
|
||||
# f_Out = outputPath + "dailyviews" + query_date + ".feather"
|
||||
# read the json back in and make a feather file?
|
||||
logging.debug(f"Run complete at {datetime.datetime.now()}")
|
||||
logging.info(f"Processed {success} successful URLs and {failure} failures.")
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!yusr/bin/env python3
|
||||
#!/usr/bin/env python3
|
||||
|
||||
###############################################################################
|
||||
#
|
||||
@ -26,7 +26,6 @@ def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
|
||||
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
|
||||
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
|
||||
parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
|
||||
parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
|
||||
parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
|
||||
args = parser.parse_args()
|
||||
@ -37,12 +36,6 @@ def main():
|
||||
|
||||
output_path = args.output_folder
|
||||
article_filename = args.article_file
|
||||
#handle -d
|
||||
if args.query_date:
|
||||
query_date = args.query_date
|
||||
else:
|
||||
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
query_date = yesterday.strftime("%Y%m%d")
|
||||
|
||||
#handle -L
|
||||
loglevel_mapping = { 'debug' : logging.DEBUG,
|
||||
@ -66,12 +59,13 @@ def main():
|
||||
export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
|
||||
export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
|
||||
export_time = str(datetime.datetime.now())
|
||||
export_date = datetime.datetime.today().strftime("%Y%m%d")
|
||||
|
||||
logging.info(f"Starting run at {export_time}")
|
||||
logging.info(f"Last commit: {export_git_hash}")
|
||||
|
||||
json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
|
||||
tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
|
||||
json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
|
||||
tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
|
||||
|
||||
api_session = api.Session("https://en.wikipedia.org/w/api.php")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user