Prefix page titles with namespace names.

This commit is contained in:
2018-07-09 22:11:17 -07:00
parent dba793c6ac
commit bf396ad366
8 changed files with 49009 additions and 49005 deletions

Submodule Mediawiki-Utilities deleted from f7329417eb

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

17
wikiq
View File

@@ -3,6 +3,7 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import pdb
import argparse
import sys
import os, os.path
@@ -32,11 +33,15 @@ class WikiqIterator():
self.fh = fh
self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in
self.mwiterator.site_info.namespaces }
self.__pages = self.load_pages()
def load_pages(self):
for page in self.mwiterator:
yield WikiqPage(page, collapse_user=self.collapse_user)
yield WikiqPage(page,
namespace_map = self.namespace_map,
collapse_user=self.collapse_user)
def __iter__(self):
return self.__pages
@@ -49,13 +54,14 @@ class WikiqPage():
'restrictions', 'mwpage', '__revisions',
'collapse_user')
def __init__(self, page, collapse_user=False):
def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id
self.title = page.title
self.namespace = page.namespace
self.redirect = page.redirect
if page.namespace != 0:
self.title = ':'.join([namespace_map[page.namespace], page.title])
else:
self.title = page.title
self.restrictions = page.restrictions
self.collapse_user = collapse_user
self.mwpage = page
self.__revisions = self.rev_list()
@@ -111,7 +117,6 @@ class WikiqPage():
class WikiqParser():
def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
self.input_file = input_file