# Functions common to both scrapers
|
|
# Copyright (C) 2018 Nathan TeBlunthuis
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# returns an iterator of wiki,url tuples
|
|
import pandas as pd
|
|
from os import makedirs, path
|
|
from shutil import rmtree
|
|
from itertools import islice
|
|
|
|
|
|
def _add_wikitype(tpl):
|
|
print(tpl)
|
|
wiki, url = tpl[0:2]
|
|
wikitype = "NA"
|
|
|
|
if "wikipedia.org" in url:
|
|
wikitype = "wikipedia"
|
|
url = url + '/w/'
|
|
|
|
elif "wikia.com" in url:
|
|
wikitype = 'wikia'
|
|
|
|
print(url)
|
|
print(wiki)
|
|
url = url.strip()
|
|
wiki = wiki.strip()
|
|
tpl = (wiki, url, wikitype)
|
|
return tpl
|
|
|
|
|
|
def read_wikilist(args):
    """Parse the wiki-list input file into (wiki, url, wikitype) tuples.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide `sep`, `no_header`, `wikilist`, and `i` (as produced
        by `add_parser_arguments`).

    Returns
    -------
    generator of (wiki, url, wikitype) tuples (see `_add_wikitype`).
    """
    # Accept several spellings for a tab separator from the command line.
    if args.sep in ['\\t', '\t', 'tab', 't']:
        sep = '\t'
    else:
        sep = args.sep

    if not args.no_header:
        # Headered input: let pandas parse and pick the named columns.
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
    else:
        # Headerless input: -i gives the 0-based wiki and url column indices.
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        # Read eagerly under `with` so the file handle is closed promptly;
        # the original left it open for the generator chain's lifetime.
        with open(args.wikilist) as infile:
            lines = infile.readlines()
        rows = (line.split(sep) for line in lines)
        wikilist = ((fields[j], fields[k]) for fields in rows)
        # NOTE(review): this skips the first line even though --no-header
        # says there is no header — preserved as-is; confirm intent.
        wikilist = islice(wikilist, 1, None)

    return (_add_wikitype(t) for t in wikilist)
|
|
|
|
|
|
def add_parser_arguments(parser):
    """Attach the command-line options shared by both scrapers.

    Parameters
    ----------
    parser : argparse.ArgumentParser

    Returns
    -------
    argparse.ArgumentParser
        The same parser, for chaining.
    """
    parser.add_argument('--no-header', action='store_true',
                        help='does the wikilist have no header?')

    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files.')

    parser.add_argument('--sep', type=str,
                        help='input table delimiter', default=',')

    # Raw string: the original non-raw literal turned \t into a TAB and
    # \f into a FORM FEED, garbling the help text.
    parser.add_argument(
        'wikilist',
        type=str,
        help=r'path to the input file: a wiki list with wiki\turl\filename')

    parser.add_argument(
        'output',
        type=str,
        help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')

    parser.add_argument('-i',
                        type=str,
                        help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
                        default='0,1')

    return parser
|
|
|
|
|
|
def prepare_output(output_path, nuke_old):
    """Ensure `output_path` exists as a directory.

    Parameters
    ----------
    output_path : str
        Directory to create (parents included).
    nuke_old : bool
        If True, delete any existing directory tree first so the
        directory ends up empty.

    Net effect matches the original (which created, possibly deleted,
    then re-created the directory) with a single create call.
    """
    if nuke_old and path.exists(output_path):
        rmtree(output_path)
    makedirs(output_path, exist_ok=True)