# Functions common to both scrapers
|
|
# Copyright (C) 2018 Nathan TeBlunthuis
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# returns an iterator of wiki,url tuples
|
|
import pandas as pd
|
|
from os import makedirs, path
|
|
from shutil import rmtree
|
|
from itertools import islice
|
|
|
|
|
|
def _add_wikitype(tpl):
|
|
print(tpl)
|
|
wiki, url = tpl[0:2]
|
|
wikitype = "NA"
|
|
|
|
if "wikipedia.org" in url:
|
|
wikitype = "wikipedia"
|
|
url = url + '/w/'
|
|
|
|
elif "wikia.com" in url:
|
|
wikitype = 'wikia'
|
|
|
|
print(url)
|
|
print(wiki)
|
|
url = url.strip()
|
|
wiki = wiki.strip()
|
|
tpl = (wiki, url, wikitype)
|
|
return tpl
|
|
|
|
|
|
def read_wikilist(args):
    """Parse the wiki-list input file into (wiki, url, wikitype) tuples.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide `sep`, `no_header`, `wikilist`, and `i` (as produced
        by `add_parser_arguments`).

    Returns
    -------
    generator of (wiki, url, wikitype) tuples (see `_add_wikitype`).
    """
    # Accept several spellings for a tab separator from the command line.
    if args.sep in ['\\t', '\t', 'tab', 't']:
        sep = '\t'
    else:
        sep = args.sep

    if not args.no_header:
        # Headered input: let pandas parse and pick the named columns.
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
    else:
        # Headerless input: -i gives the 0-based wiki and url column indices.
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        # Read eagerly under `with` so the file handle is closed promptly;
        # the original left it open for the generator chain's lifetime.
        with open(args.wikilist) as infile:
            lines = infile.readlines()
        rows = (line.split(sep) for line in lines)
        wikilist = ((fields[j], fields[k]) for fields in rows)
        # NOTE(review): this skips the first line even though --no-header
        # says there is no header — preserved as-is; confirm intent.
        wikilist = islice(wikilist, 1, None)

    return (_add_wikitype(t) for t in wikilist)
|
|
|
|
|
|
def add_parser_arguments(parser):
    """Attach the command-line options shared by both scrapers.

    Parameters
    ----------
    parser : argparse.ArgumentParser

    Returns
    -------
    argparse.ArgumentParser
        The same parser, for chaining.
    """
    parser.add_argument('--no-header', action='store_true',
                        help='does the wikilist have no header?')

    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files.')

    parser.add_argument('--sep', type=str,
                        help='input table delimiter', default=',')

    # Raw string: the original non-raw literal turned \t into a TAB and
    # \f into a FORM FEED, garbling the help text.
    parser.add_argument(
        'wikilist',
        type=str,
        help=r'path to the input file: a wiki list with wiki\turl\filename')

    parser.add_argument(
        'output',
        type=str,
        help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')

    parser.add_argument('-i',
                        type=str,
                        help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
                        default='0,1')

    return parser
|
|
|
|
|
|
def prepare_output(output_path, nuke_old):
    """Ensure `output_path` exists as a directory.

    Parameters
    ----------
    output_path : str
        Directory to create (parents included).
    nuke_old : bool
        If True, delete any existing directory tree first so the
        directory ends up empty.

    Net effect matches the original (which created, possibly deleted,
    then re-created the directory) with a single create call.
    """
    if nuke_old and path.exists(output_path):
        rmtree(output_path)
    makedirs(output_path, exist_ok=True)