1 Commits

Author SHA1 Message Date
bf396ad366 Prefix page titles with namespace names. 2018-07-09 22:11:17 -07:00
8 changed files with 49009 additions and 49005 deletions

Submodule Mediawiki-Utilities deleted from f7329417eb

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

17
wikiq
View File

@@ -3,6 +3,7 @@
# original wikiq headers are: title articleid revid date_time anon # original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion # editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size # additions_size deletions_size
import pdb
import argparse import argparse
import sys import sys
import os, os.path import os, os.path
@@ -32,11 +33,15 @@ class WikiqIterator():
self.fh = fh self.fh = fh
self.collapse_user = collapse_user self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh) self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in
self.mwiterator.site_info.namespaces }
self.__pages = self.load_pages() self.__pages = self.load_pages()
def load_pages(self): def load_pages(self):
for page in self.mwiterator: for page in self.mwiterator:
yield WikiqPage(page, collapse_user=self.collapse_user) yield WikiqPage(page,
namespace_map = self.namespace_map,
collapse_user=self.collapse_user)
def __iter__(self): def __iter__(self):
return self.__pages return self.__pages
@@ -49,13 +54,14 @@ class WikiqPage():
'restrictions', 'mwpage', '__revisions', 'restrictions', 'mwpage', '__revisions',
'collapse_user') 'collapse_user')
def __init__(self, page, collapse_user=False): def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id self.id = page.id
self.title = page.title
self.namespace = page.namespace self.namespace = page.namespace
self.redirect = page.redirect if page.namespace != 0:
self.title = ':'.join([namespace_map[page.namespace], page.title])
else:
self.title = page.title
self.restrictions = page.restrictions self.restrictions = page.restrictions
self.collapse_user = collapse_user self.collapse_user = collapse_user
self.mwpage = page self.mwpage = page
self.__revisions = self.rev_list() self.__revisions = self.rev_list()
@@ -111,7 +117,6 @@ class WikiqPage():
class WikiqParser(): class WikiqParser():
def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False): def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
self.input_file = input_file self.input_file = input_file