Prefix page titles with namespace names.

This commit is contained in:
Nathan TeBlunthuis 2018-07-09 22:11:17 -07:00
parent dba793c6ac
commit bf396ad366
8 changed files with 49009 additions and 49005 deletions

@ -1 +0,0 @@
Subproject commit f7329417ebb2f03d1e9b8a626236a3c0ce65c814

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

17
wikiq
View File

@ -3,6 +3,7 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
@ -32,11 +33,15 @@ class WikiqIterator():
self.fh = fh
self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in
self.mwiterator.site_info.namespaces }
self.__pages = self.load_pages()
def load_pages(self):
for page in self.mwiterator:
yield WikiqPage(page, collapse_user=self.collapse_user)
yield WikiqPage(page,
namespace_map = self.namespace_map,
collapse_user=self.collapse_user)
def __iter__(self):
return self.__pages
@ -49,13 +54,14 @@ class WikiqPage():
'restrictions', 'mwpage', '__revisions',
'collapse_user')
def __init__(self, page, collapse_user=False):
def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id
self.title = page.title
self.namespace = page.namespace
self.redirect = page.redirect
if page.namespace != 0:
self.title = ':'.join([namespace_map[page.namespace], page.title])
else:
self.title = page.title
self.restrictions = page.restrictions
self.collapse_user = collapse_user
self.mwpage = page
self.__revisions = self.rev_list()
@ -111,7 +117,6 @@ class WikiqPage():
class WikiqParser():
def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
self.input_file = input_file