Merge pull request #12 from makoshark/master

substantial changes to wikipedia fetching code
groceryheist 2020-04-01 16:36:56 -07:00 committed by GitHub
commit ff96d52cb9
12 changed files with 336 additions and 1484 deletions

.gitignore (new file, +4 lines)

@@ -0,0 +1,4 @@
wikipedia/data/
wikipedia/logs/
wikipedia/resources/enwp_wikiproject_covid19_articles.txt
__pycache__


@@ -0,0 +1,17 @@
#!/bin/bash -x
TZ="UTC"
date_string=$(date +%Y%m%d)
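# refresh the WikiProject article list; stderr from each step is captured to a dated log via process substitution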
./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/enwp-wikiproject_scraper-${date_string}.log)
revs_log="enwp-revisions-${date_string}.log"
./wikipedia/scripts/fetch_enwiki_revisions.py 2> >(tee wikipedia/logs/${revs_log})
mv wikipedia/logs/${revs_log} /var/www/covid19/wikipedia/logs/
revs_tsv="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.tsv"
mv wikipedia/data/${revs_tsv} /var/www/covid19/wikipedia
revs_json="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.json"
xz wikipedia/data/${revs_json}
mv wikipedia/data/${revs_json}.xz /var/www/covid19/wikipedia

cron-wikipedia_views.sh (new file, +16 lines)

@@ -0,0 +1,16 @@
#!/bin/bash -x
TZ="UTC"
date_string=$(date +%Y%m%d)
./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/enwp-wikiproject_scraper-${date_string}.log)
# get the list of articles
view_log="enwp-daily_views-${date_string}.log"
./wikipedia/scripts/fetch_enwiki_daily_views.py 2> >(tee wikipedia/logs/${view_log})
mv wikipedia/logs/${view_log} /var/www/covid19/wikipedia/logs/${view_log}
mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.tsv /var/www/covid19/wikipedia/
# xz wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json
mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json /var/www/covid19/wikipedia/


@@ -0,0 +1,27 @@
#!/usr/bin/env python3
import sys
import subprocess
import logging
def git_hash(short=False):
    if short:
        return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    else:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
def get_loglevel(arg_loglevel):
loglevel_mapping = { 'debug' : logging.DEBUG,
'info' : logging.INFO,
'warning' : logging.WARNING,
'error' : logging.ERROR,
'critical' : logging.CRITICAL }
if arg_loglevel in loglevel_mapping:
loglevel = loglevel_mapping[arg_loglevel]
return loglevel
else:
print("Choose a valid log level: debug, info, warning, error, or critical", file=sys.stderr)
return logging.INFO
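
For orientation, here is a minimal sketch of how the fetch scripts below consume these helpers; it assumes the module is importable as digobs and that the script runs from inside the git checkout (both true for the scripts in this commit):

import argparse
import logging

import digobs

parser = argparse.ArgumentParser(description='Example consumer of the digobs helpers.')
# get_loglevel converts 'debug'/'info'/... into a logging constant at argument-parse time
parser.add_argument('-L', '--logging_level', default='info', type=digobs.get_loglevel)
args = parser.parse_args()

logging.basicConfig(level=args.logging_level)
# git_hash() ties each export to the commit of the code that produced it
logging.info(f"Last commit: {digobs.git_hash()}")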


@@ -0,0 +1,109 @@
#!/usr/bin/env python3
###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch view information for each article. Output is to (3) JSON and TSV.
#
###############################################################################
import sys
import requests
import argparse
import json
import time
import os.path
import datetime
import logging
from csv import DictWriter
import digobs
#import feather #TBD
def parse_args():
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
args = parser.parse_args()
return(args)
def main():
args = parse_args()
outputPath = args.output_folder
articleFile = args.article_file
#handle -d
if args.query_date:
query_date = args.query_date
else:
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
query_date = yesterday.strftime("%Y%m%d")
#handle -W
if args.logging_destination:
logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
else:
logging.basicConfig(level=args.logging_level)
export_time = str(datetime.datetime.now())
export_date = datetime.datetime.today().strftime("%Y%m%d")
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {digobs.git_hash()}")
#1 Load up the list of article names
j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
with open(articleFile, 'r') as infile:
articleList = list(map(str.strip, infile))
success = 0 #for logging how many work/fail
failure = 0
#3 Save results as a JSON and TSV
with open(j_outfilename, 'w') as j_outfile, \
open(t_outfilename, 'w') as t_outfile:
#2 Repeatedly call the API with that list of names
for a in articleList:
url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
response = requests.get(url)
if response.ok:
jd = response.json()["items"][0]
success = success + 1
            else:
                failure = failure + 1
                logging.warning(f"Failure: {response.status_code} from {url}")
                # nothing was fetched for this article, so skip the write step below
                continue
# start writing the CSV File if it doesn't exist yet
try:
dw
except NameError:
dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
dw.writeheader()
logging.debug(f"printing data: {jd}")
# write out the line of the json file
print(json.dumps(jd), file=j_outfile)
# write out of the csv file
dw.writerow(jd)
# f_Out = outputPath + "dailyviews" + query_date + ".feather"
# read the json back in and make a feather file?
logging.debug(f"Run complete at {datetime.datetime.now()}")
logging.info(f"Processed {success} successful URLs and {failure} failures.")
if __name__ == "__main__":
main()
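
For reference, a minimal sketch of a single pageviews request as issued in the loop above; Example_article is a placeholder title, and the listed keys reflect the REST API's per-article response at the time of this commit (verify against the live endpoint):

import requests

url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
       "en.wikipedia/all-access/all-agents/Example_article/daily/2020033000/2020033000")
resp = requests.get(url)
if resp.ok:
    item = resp.json()["items"][0]
    # one dict per day in the requested range; its sorted keys become the TSV header:
    # access, agent, article, granularity, project, timestamp, views
    print(item["article"], item["views"])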


@@ -0,0 +1,148 @@
#!/usr/bin/env python3
###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch view information for each article. Output is to (3) JSON and TSV.
#
###############################################################################
import argparse
import logging
import os.path
import json
import datetime
from requests import Request
from csv import DictWriter
from mw import api
import digobs
def parse_args():
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
args = parser.parse_args()
return(args)
def main():
args = parse_args()
output_path = args.output_folder
article_filename = args.article_file
#handle -W
if args.logging_destination:
logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
else:
logging.basicConfig(level=args.logging_level)
export_time = str(datetime.datetime.now())
export_date = datetime.datetime.today().strftime("%Y%m%d")
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {digobs.git_hash()}")
json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
api_session = api.Session("https://en.wikipedia.org/w/api.php")
    # list of properties from the API we want to gather (basically all of
    # them supported by mediawiki-utilities)
rv_props = {'revid' : 'ids',
'timestamp' : 'timestamp',
'user' : 'user',
'userid' : 'userid',
'size' : 'size',
'sha1' : 'sha1',
'contentmodel' : 'contentmodel',
'tags' : 'tags',
'flags' : 'flags',
'comment' : 'comment',
'content' : 'content' }
exclude_from_tsv = ['tags', 'comment', 'content', 'flags']
# load the list of articles
with open(article_filename, 'r') as infile:
article_list= list(map(str.strip, infile))
def get_revisions_for_page(title):
return api_session.revisions.query(properties=rv_props.values(),
titles={title},
direction="newer")
tsv_fields = ['title', 'pageid', 'namespace']
tsv_fields = tsv_fields + list(rv_props.keys())
# drop fields that we identified for exclusion
tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]
# add special export fields
tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
export_info = { 'git_commit' : digobs.git_hash(),
'timestamp' : export_time }
with open(json_output_filename, 'w') as json_output, \
open(tsv_output_filename, 'w') as tsv_output:
tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
tsv_writer.writeheader()
for article in article_list:
logging.info(f"pulling revisions for: {article}")
for rev in get_revisions_for_page(article):
logging.debug(f"processing raw revision: {rev}")
# add export metadata
rev['exported'] = export_info
# save the json version of the code
print(json.dumps(rev), file=json_output)
# handle missing data
if "sha1" not in rev:
rev["sha1"] = ""
if "userhidden" in rev:
rev["user"] = ""
rev["userid"] = ""
# recode anon so it's true or false instead of present/missing
if "anon" in rev:
rev["anon"] = True
else:
rev["anon"] = False
# let's recode "minor" in the same way
if "minor" in rev:
rev["minor"] = True
else:
rev["minor"] = False
# add page title information
rev['title'] = rev['page']['title']
rev['pageid'] = rev['page']['pageid']
rev['namespace'] = rev['page']['ns']
# construct a URL
rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
params={'title' : rev['title'].replace(" ", "_"),
'oldid' : rev['revid']}).prepare().url
rev['export_timestamp'] = export_time
rev['export_commit'] = digobs.git_hash(short=True)
tsv_writer.writerow({k: rev[k] for k in tsv_fields})
if __name__ == "__main__":
main()
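
To make the per-revision recoding easier to follow, here is a hedged, self-contained sketch of how one raw revision dict is normalized before being written to the TSV; the input values are hypothetical, and the handling mirrors the loop above:

from requests import Request

# hypothetical raw revision as returned by the revisions query (values invented)
raw = {'revid': 123456789,
       'user': 'ExampleUser',
       'userid': 54321,
       'minor': '',                      # present-but-empty means the edit was marked minor
       'page': {'title': 'Example title', 'pageid': 1, 'ns': 0}}

rev = dict(raw)
rev['sha1'] = rev.get('sha1', '')        # missing content hash becomes an empty string
if 'userhidden' in rev:                  # suppressed usernames are blanked
    rev['user'] = ''
    rev['userid'] = ''
rev['anon'] = 'anon' in raw              # present/missing flags become booleans
rev['minor'] = 'minor' in raw
rev['title'] = rev['page']['title']
rev['pageid'] = rev['page']['pageid']
rev['namespace'] = rev['page']['ns']
# permanent link to this exact revision
rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
                     params={'title': rev['title'].replace(' ', '_'),
                             'oldid': rev['revid']}).prepare().url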


@@ -18,58 +18,44 @@
 ###############################################################################
 import argparse
-import subprocess
 import requests
 import datetime
 import logging
 import re
 import math
 from bs4 import BeautifulSoup
+import digobs

 def parse_args():
     parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 Wikiproject.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="../resources/", type=str)
-    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'),
-    parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/')
+    parser.add_argument('-o', '--output_file', help='Where to save output', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
+    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel),
+    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
     args = parser.parse_args()
     return(args)

 def main():
     args = parse_args()
-    outputPath = args.output_folder
+    outputFile = args.output_file

     #handle -W
-    today = datetime.datetime.today().strftime('%Y%m%d')
-    dest = args.logging_destination
-    logHome = f"{dest}scraping{today}"
-    #handle -L
-    loglevel = args.logging_level
-    if loglevel == 'debug':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
-    elif loglevel == 'info':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
-    elif loglevel == 'warning':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
-    elif loglevel == 'error':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
-    elif loglevel == 'critical':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
+    if args.logging_destination:
+        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
     else:
-        print("Choose a valid log level: debug, info, warning, error, or critical")
-        exit
-    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
-    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-    logging.info(f"Last commit: {export_git_hash}")
-    outputFile = f"{outputPath}articles.txt"
-    logging.debug(f"Starting scrape at {datetime.datetime.now()} and destructively outputting article list to {outputFile}.")
+        logging.basicConfig(level=args.logging_level)
+    export_time = str(datetime.datetime.now())
+    logging.info(f"Starting at {export_time} and destructively outputting article list to {outputFile}.")

     #1 How many hits to the fcgi?
-    #make a session
     session = requests.Session()
     originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"
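
The rest of the scraper falls outside this hunk. As a hedged illustration of the paging arithmetic implied by the limit=1000&offset=1 query parameters and the math import, successive fcgi offsets might be computed along these lines (total_articles is a hypothetical count parsed from the tool's first page):

import math

total_articles = 2500      # hypothetical; scraped from the first fcgi response
limit = 1000               # matches limit=1000 in originalURL
num_calls = math.ceil(total_articles / limit)
offsets = [i * limit + 1 for i in range(num_calls)]   # 1, 1001, 2001, ...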

File diff suppressed because it is too large.


@@ -1,125 +0,0 @@
#!/usr/bin/env python3
###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch view information for each article. Output is to (3) JSON and TSV.
#
###############################################################################
import requests
import argparse
import json
import csv
import time
import os.path
import datetime
import logging
#import feather #TBD
def parse_args():
parser = argparse.ArgumentParser(description='Call the views API repeatedly.')
parser.add_argument('-o', '--output_folder', help='Where to save output', default="../data/", type=str)
parser.add_argument('-i', '--article_file', help='File listing article names', default="../resources/articles.txt", type=str)
parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format please.', type=str)
parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'),
parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/'),
args = parser.parse_args()
return(args)
def main():
args = parse_args()
outputPath = args.output_folder
articleFile = args.article_file
#handle -d
if (args.query_date):
queryDate = args.query_date
else:
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
queryDate = yesterday.strftime("%Y%m%d")
queryDate = queryDate + "00" #requires specifying hours
#handle -W
logHome = f"{args.logging_destination}dailylogrun{datetime.datetime.today().strftime('%Y%m%d')}"
#handle -L
loglevel = args.logging_level
if loglevel == 'debug':
logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
elif loglevel == 'info':
logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
elif loglevel == 'warning':
logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
elif loglevel == 'error':
logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
elif loglevel == 'critical':
logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
else:
print("Choose a valid log level: debug, info, warning, error, or critical")
exit
articleList = []
logging.debug(f"Starting run at {datetime.datetime.now()}")
#1 Load up the list of article names
j_Out = f"{outputPath}dailyviews{queryDate}.json"
t_Out = f"{outputPath}dailyviews{queryDate}.tsv"
with open(articleFile, 'r') as infile:
articleList = list(infile)
j = []
success = 0 #for logging how many work/fail
failure = 0
#2 Repeatedly call the API with that list of names
for a in articleList:
a = a.strip("\"\n") #destringify
url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}"
response = requests.get(url)
if response.ok:
jd = json.loads(response.content)
j.append(jd["items"][0])
time.sleep(.1)
success = success + 1
else:
failure = failure + 1
logging.warning(f"Failure: {response.status_code} from {url}")
#3 Save results as a JSON and TSV
#all data in j now, make json file
logging.info(f"Processed {success} successful URLs and {failure} failures.")
with open(j_Out, 'w') as j_outfile:
json.dump(j, j_outfile, indent=2)
with open(t_Out, 'w') as t_outfile:
dw = csv.DictWriter(t_outfile, sorted(j[0].keys()), delimiter='\t')
dw.writeheader()
dw.writerows(j)
logging.debug(f"Run complete at {datetime.datetime.now()}")
# f_Out = outputPath + "dailyviews" + queryDate + ".feather"
# read the json back in and make a feather file?
if __name__ == "__main__":
main()