added support to parse namespaces from title
This is necessary for wikis (e.g., Wikia XML dumps) that do not include namespace metadata as tags within each <page>.
This commit is contained in:
parent
108c8442b2
commit
d934700ee9
@ -1 +1 @@
|
||||
Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7
|
||||
Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d
|
21
wikiq
21
wikiq
@ -104,6 +104,22 @@ class WikiqParser():
|
||||
self.collapse_user = collapse_user
|
||||
self.persist = persist
|
||||
self.printed_header = False
|
||||
self.namespaces = []
|
||||
|
||||
def __get_namespace_from_title(self, title):
    """Infer a page's namespace id from the prefix of its title.

    ``self.namespaces`` is read as a mapping from namespace name to
    namespace id. A title is assigned to a named namespace when it starts
    with ``"<name>:"``. The namespace without a name (key ``None`` or an
    empty string) cannot be recognised by prefix, so its id is used as the
    fallback for titles that match no named namespace.

    Args:
        title: The page title to inspect.

    Returns:
        The id of the first namespace whose ``"<name>:"`` prefixes
        ``title``; otherwise the id of the unnamed namespace; otherwise
        ``None`` when no unnamed namespace is known.
    """
    # self.namespaces is initialised to an empty list and only later
    # replaced by a name -> id dict, so tolerate the empty case explicitly.
    namespaces = self.namespaces or {}
    default_ns = None

    for name, ns_id in namespaces.items():
        # The unnamed namespace has no "<name>:" prefix to match against;
        # remember its id as the fallback instead of testing it.
        if not name:
            default_ns = ns_id
            continue

        if title.startswith(name + ":"):
            return ns_id

    # No named namespace matched, so fall back to the unnamed one (if any).
    return default_ns
|
||||
|
||||
def process(self):
|
||||
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
|
||||
@ -116,6 +132,9 @@ class WikiqParser():
|
||||
# Construct dump file iterator
|
||||
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
|
||||
|
||||
# extract list of namespaces
|
||||
self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
|
||||
|
||||
page_count = 0
|
||||
rev_count = 0
|
||||
# Iterate through pages
|
||||
@ -134,7 +153,7 @@ class WikiqParser():
|
||||
'articleid' : page.id,
|
||||
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
|
||||
'title' : '"' + page.title + '"',
|
||||
'namespace' : page.namespace,
|
||||
'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
|
||||
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
|
||||
|
||||
# if revisions are deleted, /many/ things will be missing
|
||||
|
Loading…
Reference in New Issue
Block a user