Prefix page titles with namespace names.

This commit is contained in:
2018-07-09 22:11:17 -07:00
parent dba793c6ac
commit bf396ad366
8 changed files with 49009 additions and 49005 deletions

17
wikiq
View File

@@ -3,6 +3,7 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
@@ -32,11 +33,15 @@ class WikiqIterator():
self.fh = fh
self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in
self.mwiterator.site_info.namespaces }
self.__pages = self.load_pages()
def load_pages(self):
for page in self.mwiterator:
yield WikiqPage(page, collapse_user=self.collapse_user)
yield WikiqPage(page,
namespace_map = self.namespace_map,
collapse_user=self.collapse_user)
def __iter__(self):
return self.__pages
@@ -49,13 +54,14 @@ class WikiqPage():
'restrictions', 'mwpage', '__revisions',
'collapse_user')
def __init__(self, page, collapse_user=False):
def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id
self.title = page.title
self.namespace = page.namespace
self.redirect = page.redirect
if page.namespace != 0:
self.title = ':'.join([namespace_map[page.namespace], page.title])
else:
self.title = page.title
self.restrictions = page.restrictions
self.collapse_user = collapse_user
self.mwpage = page
self.__revisions = self.rev_list()
@@ -111,7 +117,6 @@ class WikiqPage():
class WikiqParser():
def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
self.input_file = input_file