Changes to a bunch of the Wikipedia view code

- Renamed articles.txt to the more specific
  enwp_wikiproject_covid19_articles.txt

Changes to both scripts:

- Updated filenames to match the new standard
- Reworked the logging code so that it writes to stderr by default.
  Because logging.basicConfig() can only be called once, this ended up
  being a bigger change (see the sketch after this list).
- Made the scripts log the git commit hash and export time so we can
  track which code produced which dataset.
- Made the programs take files instead of directories as output (this
  lets us run them more than once a day).
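
The new pattern is roughly the following (a minimal sketch using the
argument names from the diff below; setup_logging() is a hypothetical
helper, the real scripts do this inline in main()):

    import logging
    import subprocess

    def setup_logging(logging_level='info', logging_destination=None):
        # map the -L option string onto the stdlib logging constants
        loglevel_mapping = {'debug': logging.DEBUG,
                            'info': logging.INFO,
                            'warning': logging.WARNING,
                            'error': logging.ERROR,
                            'critical': logging.CRITICAL}
        loglevel = loglevel_mapping[logging_level]

        # logging.basicConfig() only takes effect the first time it is
        # called, so the file-vs-stderr decision has to happen in one place
        if logging_destination:
            logging.basicConfig(filename=logging_destination, filemode='a', level=loglevel)
        else:
            logging.basicConfig(level=loglevel)  # stderr by default

    setup_logging()

    # record which code produced this dataset (assumes we run inside the git repo)
    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    logging.info(f"Last commit: {export_git_hash}")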

Changes to wikipedia_views/scripts/fetch_daily_views.py:

- Changed the output so that the script emits a sequence of JSON
  dictionaries (one per line), per the standard we agreed to and matching
  what the Twitter, Github, and other dumps do (see the sketch below).
  The previous behavior was to output a single JSON list object.
- A number of other small changes and tweaks throughout.
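
As a concrete illustration of the output change (a minimal sketch with
made-up records, not the real API responses):

    import json

    records = [{"article": "Pandemic", "views": 1000},
               {"article": "Coronavirus", "views": 2500}]

    # old behavior: one JSON list object for the whole day
    with open("dailyviews_old.json", "w") as f:
        json.dump(records, f, indent=2)

    # new behavior: a sequence of JSON dictionaries, one per line
    with open("dailyviews_new.json", "w") as f:
        for record in records:
            print(json.dumps(record), file=f)

One-dictionary-per-line output can be appended to and streamed through
line-oriented tools, which is part of why it works well as a shared
standard across the dumps.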
Author: Benjamin Mako Hill
Date: 2020-04-01 07:15:12 -07:00
parent 72bf7bcd37
commit 38fdd07b39
3 changed files with 1003 additions and 824 deletions

wikipedia_views/scripts/fetch_daily_views.py

@@ -9,31 +9,30 @@
 #                                                                             #
 ###############################################################################

+import sys
+import subprocess
 import requests
 import argparse
 import json
-import csv
 import time
 import os.path
-import argparse
 import datetime
 import logging
+from csv import DictWriter

 #import feather #TBD

 def parse_args():

-    parser = argparse.ArgumentParser(description='Call the views API repeatedly.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="../data/", type=str)
-    parser.add_argument('-i', '--article_file', help='File listing article names', default="../resources/articles.txt", type=str)
-    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format please.', type=str)
-    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/'),
+    parser = argparse.ArgumentParser(description='Call the views API to collect data view data.')
+    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia_views/data", type=str)
+    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia_views/resources/enwp_wikiproject_covid19_articles.txt", type=str)
+    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
+    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
     args = parser.parse_args()
     return(args)

 def main():
     args = parse_args()
@@ -42,7 +41,7 @@ def main():
     articleFile = args.article_file

     #handle -d
-    if (args.query_date):
+    if args.query_date:
         queryDate = args.query_date
     else:
         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
@@ -50,74 +49,78 @@ def main():
     queryDate = queryDate + "00" #requires specifying hours

-    #handle -W
-    logHome = f"{args.logging_destination}dailylogrun{datetime.datetime.today().strftime('%Y%m%d')}"
-
     #handle -L
-    loglevel = args.logging_level
-    if loglevel == 'debug':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
-    elif loglevel == 'info':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
-    elif loglevel == 'warning':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
-    elif loglevel == 'error':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
-    elif loglevel == 'critical':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
+    loglevel_mapping = { 'debug' : logging.DEBUG,
+                         'info' : logging.INFO,
+                         'warning' : logging.WARNING,
+                         'error' : logging.ERROR,
+                         'critical' : logging.CRITICAL }
+
+    if args.logging_level in loglevel_mapping:
+        loglevel = loglevel_mapping[args.logging_level]
     else:
         print("Choose a valid log level: debug, info, warning, error, or critical")
         exit

-    articleList = []
-    logging.debug(f"Starting run at {datetime.datetime.now()}")
+    #handle -W
+    if args.logging_destination:
+        logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
+    else:
+        logging.basicConfig(level=loglevel)
+
+    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
+    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
+    export_time = str(datetime.datetime.now())
+
+    logging.info(f"Starting run at {export_time}")
+    logging.info(f"Last commit: {export_git_hash}")

     #1 Load up the list of article names
-
-    j_Out = f"{outputPath}dailyviews{queryDate}.json"
-    t_Out = f"{outputPath}dailyviews{queryDate}.tsv"
+    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
+    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")

     with open(articleFile, 'r') as infile:
         articleList = list(infile)

-    j = []
     success = 0 #for logging how many work/fail
     failure = 0

-    #2 Repeatedly call the API with that list of names
-    for a in articleList:
-        a = a.strip("\"\n") #destringify
-        url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}"
-        response = requests.get(url)
-        if response.ok:
-            jd = json.loads(response.content)
-            j.append(jd["items"][0])
-            time.sleep(.1)
-            success = success + 1
-        else:
-            failure = failure + 1
-            logging.warning(f"Failure: {response.status_code} from {url}")
-
     #3 Save results as a JSON and TSV
+    with open(j_outfilename, 'w') as j_outfile, \
+         open(t_outfilename, 'w') as t_outfile:

-    #all data in j now, make json file
-    logging.info(f"Processed {success} successful URLs and {failure} failures.")
+        #2 Repeatedly call the API with that list of names
+        for a in articleList:
+            a = a.strip("\"\n") #destringify
+            url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}"

-    with open(j_Out, 'w') as j_outfile:
-        json.dump(j, j_outfile, indent=2)
+            response = requests.get(url)
+            if response.ok:
+                jd = response.json()["items"][0]
+                success = success + 1
+            else:
+                failure = failure + 1
+                logging.warning(f"Failure: {response.status_code} from {url}")

-    with open(t_Out, 'w') as t_outfile:
-        dw = csv.DictWriter(t_outfile, sorted(j[0].keys()), delimiter='\t')
-        dw.writeheader()
-        dw.writerows(j)
+            # start writing the CSV File if it doesn't exist yet
+            try:
+                dw
+            except NameError:
+                dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
+                dw.writeheader()

-    logging.debug(f"Run complete at {datetime.datetime.now()}")
+            logging.debug(f"printing data: {jd}")
+
+            # write out the line of the json file
+            print(json.dumps(jd), file=j_outfile)
+
+            # write out of the csv file
+            dw.writerow(jd)

     # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
     # read the json back in and make a feather file?
+    logging.debug(f"Run complete at {datetime.datetime.now()}")
+    logging.info(f"Processed {success} successful URLs and {failure} failures.")

 if __name__ == "__main__":

(second script: scrapes the list of pages tracked by the COVID-19 Wikiproject)

@@ -18,6 +18,7 @@
 ###############################################################################

 import argparse
+import subprocess
 import requests
 import datetime
 import logging
@@ -28,48 +29,45 @@ from bs4 import BeautifulSoup
 def parse_args():

     parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 Wikiproject.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="../resources/", type=str)
+    parser.add_argument('-o', '--output_file', help='Where to save output', default="wikipedia_views/resources/enwp_wikiproject_covid19_articles.txt", type=str)
     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/')
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
     args = parser.parse_args()
     return(args)

 def main():
     args = parse_args()

-    outputPath = args.output_folder
-
-    #handle -W
-    today = datetime.datetime.today().strftime('%Y%m%d')
-    dest = args.logging_destination
-    logHome = f"{dest}scraping{today}"
+    outputFile = args.output_file

     #handle -L
-    loglevel = args.logging_level
-    if loglevel == 'debug':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
-    elif loglevel == 'info':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
-    elif loglevel == 'warning':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
-    elif loglevel == 'error':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
-    elif loglevel == 'critical':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
+    loglevel_mapping = { 'debug' : logging.DEBUG,
+                         'info' : logging.INFO,
+                         'warning' : logging.WARNING,
+                         'error' : logging.ERROR,
+                         'critical' : logging.CRITICAL }
+
+    if args.logging_level in loglevel_mapping:
+        loglevel = loglevel_mapping[args.logging_level]
     else:
         print("Choose a valid log level: debug, info, warning, error, or critical")
         exit

-    outputFile = f"{outputPath}articles.txt"
-    logging.debug(f"Starting scrape at {datetime.datetime.now()} and destructively outputting article list to {outputFile}.")
+    #handle -W
+    if args.logging_destination:
+        logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
+    else:
+        logging.basicConfig(level=loglevel)
+
+    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
+    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
+    export_time = str(datetime.datetime.now())
+
+    logging.info(f"Starting at {export_time} and destructively outputting article list to {outputFile}.")
+    logging.info(f"Last commit: {export_git_hash}")

     #1 How many hits to the fcgi?
     #make a session
     session = requests.Session()

     originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"