added support to parse namespaces from title
This is necessary for wikis (e.g., Wikia XML dumps) that do not include namespace metadata as tags within each <page>.
This commit is contained in:
parent
108c8442b2
commit
d934700ee9
@ -1 +1 @@
|
|||||||
Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7
|
Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d
|
21
wikiq
21
wikiq
@ -104,6 +104,22 @@ class WikiqParser():
|
|||||||
self.collapse_user = collapse_user
|
self.collapse_user = collapse_user
|
||||||
self.persist = persist
|
self.persist = persist
|
||||||
self.printed_header = False
|
self.printed_header = False
|
||||||
|
self.namespaces = []
|
||||||
|
|
||||||
|
def __get_namespace_from_title(self, title):
    """Infer a page's namespace id from the prefix of its title.

    Used as a fallback for dumps whose <page> elements carry no
    namespace tag: the title is compared against every known
    namespace name followed by a colon (e.g. "Talk:Foo").

    self.namespaces is expected to map namespace name -> namespace id.
    The entry whose name is None or the empty string is treated as the
    default (unprefixed) namespace.

    Returns the id of the first namespace whose "<name>:" prefix matches
    the title, otherwise the default namespace id, or None when no
    default namespace is known.
    """
    namespaces = self.namespaces
    default_ns = None

    # Iterate keys (not .items()) so an empty, not-yet-populated
    # container of any type simply yields the None default.
    for ns in namespaces:
        # A nameless namespace cannot be matched by prefix; remember
        # its id as the fallback for titles without a known prefix.
        if not ns:
            default_ns = namespaces[ns]
            continue

        if title.startswith(ns + ":"):
            return namespaces[ns]

    # if we've made it this far with no matches, we return the default namespace
    return default_ns
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
|
print("Processing file: %s" % self.input_file.name, file=sys.stderr)
|
||||||
@ -116,6 +132,9 @@ class WikiqParser():
|
|||||||
# Construct dump file iterator
|
# Construct dump file iterator
|
||||||
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
|
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
|
||||||
|
|
||||||
|
# extract mapping of namespace names to namespace ids
|
||||||
|
self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
|
||||||
|
|
||||||
page_count = 0
|
page_count = 0
|
||||||
rev_count = 0
|
rev_count = 0
|
||||||
# Iterate through pages
|
# Iterate through pages
|
||||||
@ -134,7 +153,7 @@ class WikiqParser():
|
|||||||
'articleid' : page.id,
|
'articleid' : page.id,
|
||||||
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
|
'editor_id' : "" if rev.contributor.id == None else rev.contributor.id,
|
||||||
'title' : '"' + page.title + '"',
|
'title' : '"' + page.title + '"',
|
||||||
'namespace' : page.namespace,
|
'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
|
||||||
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
|
'deleted' : "TRUE" if rev.text.deleted else "FALSE" }
|
||||||
|
|
||||||
# if revisions are deleted, /many/ things will be missing
|
# if revisions are deleted, /many/ things will be missing
|
||||||
|
Loading…
Reference in New Issue
Block a user