mediawiki_dump_tools/bin/wikiq

#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import sys
import os
sys.path.append("..")
from wikiq_util import calculate_persistence
from wikiq_util import WikiqIterator
from wikiq_util import WikiqPage
from wikiq_util import WikiqParser
from wikiq_util import open_input_file
from wikiq_util import open_output_file


# Command-line interface for wikiq. Each option declared here is read back
# from the namespace returned by parser.parse_args().
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
# Zero or more dump files; with none given, the script reads stdin and writes stdout.
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

# NOTE: nargs=1 means the parsed value is a one-element list, not a bare string.
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

# The help text here previously repeated the --url-encode description verbatim,
# which was a copy-paste error; only the part about persistence applies.
parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for persistence calculation.")

# Driver: parse the command line, then run one WikiqParser per dump file, or a
# single parser over stdin/stdout when no dump files were named.
args = parser.parse_args()

# Options forwarded unchanged to every WikiqParser, whichever input source is used.
parser_options = dict(collapse_user=args.collapse_user,
                      persist=args.persist,
                      persist_legacy=args.persist_legacy,
                      urlencode=args.urlencode)

if args.dumpfiles:
    # --output-dir is declared with nargs=1, so its value is a one-element
    # list. It does not change between files, so resolve it once up front.
    output_dir = args.output_dir[0] if args.output_dir else "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)
        output_file = None

        try:
            print("Processing file: %s" % filename, file=sys.stderr)

            if args.stdout:
                output_file = sys.stdout
            else:
                # Output is named after the dump file's basename, inside output_dir.
                output_path = os.path.join(output_dir, os.path.basename(filename))
                output_file = open_output_file(output_path)

            wikiq = WikiqParser(input_file, output_file, **parser_options)
            wikiq.process()
        finally:
            # Release the handles even when processing fails part-way through.
            input_file.close()
            if output_file is sys.stdout:
                # Never close sys.stdout: it is shared by every dump file in
                # this run, and closing it after the first file would make the
                # next one fail with "I/O operation on closed file".
                output_file.flush()
            elif output_file is not None:
                output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout, **parser_options)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")