added support to parse namespaces from title

This is necessary for wikis (e.g., Wikia XML dumps) that do not include
namespace metadata as tags within each <page>.
This commit is contained in:
Benjamin Mako Hill 2015-07-23 12:12:20 -07:00
parent 108c8442b2
commit d934700ee9
2 changed files with 21 additions and 2 deletions

@ -1 +1 @@
Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7
Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d

21
wikiq
View File

@ -104,6 +104,22 @@ class WikiqParser():
self.collapse_user = collapse_user
self.persist = persist
self.printed_header = False
self.namespaces = []
def __get_namespace_from_title(self, title):
    """Infer a page's namespace id from the prefix of its title.

    Fallback for dumps whose pages do not carry their own namespace.
    ``self.namespaces`` is read as a mapping of namespace name -> id; a
    namespace matches when its name followed by ":" prefixes ``title``.

    Returns the id of the first matching namespace. If nothing matches,
    returns the id stored under the unnamed namespace (a ``None`` or
    empty-string key), or ``None`` when no such entry exists.
    """
    default_ns = None

    for ns in self.namespaces:
        # The unnamed namespace has no prefix to match against; remember its
        # id as the fallback. Both None and "" are accepted because the name
        # of a namespace without text may be reported either way.
        if not ns:
            default_ns = self.namespaces[ns]
            continue

        if title.startswith(ns + ":"):
            return self.namespaces[ns]

    # No named namespace prefixes the title: fall back to the default.
    return default_ns
def process(self):
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
@ -116,6 +132,9 @@ class WikiqParser():
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
# extract namespaces as a mapping of namespace name -> id
self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
page_count = 0
rev_count = 0
# Iterate through pages
@ -134,7 +153,7 @@ class WikiqParser():
'articleid' : page.id,
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
'title' : '"' + page.title + '"',
'namespace' : page.namespace,
'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
# if revisions are deleted, /many/ things will be missing