Initialize the repository for the wikia user roles scraper project.
This commit is contained in:
85
scraper_utils.py
Normal file
85
scraper_utils.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# returns an iterator of wiki,url tuples
|
||||
import pandas as pd
|
||||
from os import makedirs, path
|
||||
from shutil import rmtree
|
||||
from itertools import islice
|
||||
|
||||
|
||||
def _add_wikitype(tpl):
|
||||
print(tpl)
|
||||
wiki, url = tpl[0:2]
|
||||
wikitype = "NA"
|
||||
|
||||
if "wikipedia.org" in url:
|
||||
wikitype = "wikipedia"
|
||||
url = url + '/w/'
|
||||
|
||||
elif "wikia.com" in url:
|
||||
wikitype = 'wikia'
|
||||
|
||||
print(url)
|
||||
print(wiki)
|
||||
url = url.strip()
|
||||
wiki = wiki.strip()
|
||||
tpl = (wiki, url, wikitype)
|
||||
return tpl
|
||||
|
||||
|
||||
def read_wikilist(args):
    """Return an iterator of ``(wiki, url, wikitype)`` tuples from *args.wikilist*.

    With a header (the default) the file is parsed with pandas and the
    ``dbname`` and ``url`` columns are used.  With ``--no-header`` the file
    is split manually on the delimiter and the columns are picked by the two
    0-based indices in ``args.i`` (e.g. ``'0,1'``).

    NOTE(review): the no-header branch still skips the first line via
    ``islice(..., 1, None)`` — confirm whether that is intentional.  The file
    handle opened here is never closed explicitly; it is only released when
    the generator is exhausted or garbage-collected.
    """
    # Accept several command-line spellings of "tab" for convenience.
    if args.sep in ['\\t', '\t', 'tab', 't']:
        sep = '\t'
    else:
        sep = args.sep

    if not args.no_header:
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
    else:
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        wikilist = open(args.wikilist)
        wikilist = (line.split(sep) for line in wikilist)
        wikilist = ((fields[j], fields[k]) for fields in wikilist)
        wikilist = islice(wikilist, 1, None)

    # Annotate each (wiki, url) pair with its hosting platform.
    return (_add_wikitype(t) for t in wikilist)
|
||||
|
||||
|
||||
def add_parser_arguments(parser):
    """Register the scraper's shared command-line arguments on *parser*.

    Adds the ``wikilist`` and ``output`` positionals plus the ``--no-header``,
    ``--nuke-old``, ``--sep`` and ``-i`` options, and returns the same parser
    so calls can be chained.
    """
    parser.add_argument('--no-header', action='store_true',
                        help='does the wikilist have no header?')

    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files.')

    parser.add_argument('--sep', type=str,
                        help='input table delimiter', default=',')

    parser.add_argument(
        'wikilist',
        type=str,
        # Raw string: the original help text contained accidental escape
        # sequences (a literal tab from '\t' and a form feed from '\f').
        help=r'path to the input file: a wiki list with wiki\turl\tfilename')

    parser.add_argument(
        'output',
        type=str,
        help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')

    parser.add_argument('-i',
                        type=str,
                        help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
                        default='0,1')

    return parser
|
||||
|
||||
|
||||
def prepare_output(output_path, nuke_old):
    """Ensure *output_path* exists as a directory.

    If *nuke_old* is true, any pre-existing directory (and its contents) is
    removed first, so the caller starts with an empty output directory.

    The original version unconditionally created the directory and then, when
    nuking, immediately deleted and recreated it; this does the removal first
    and relies on ``exist_ok`` to avoid the exists()/makedirs() race.
    """
    if nuke_old and path.exists(output_path):
        rmtree(output_path)
    makedirs(output_path, exist_ok=True)
|
||||
Reference in New Issue
Block a user