added support to parse namespaces from title
This is necessary for wikis (e.g., Wikia XML dumps) that do not include namespace metadata as tags within each <page>.
This commit is contained in:
		
							parent
							
								
									108c8442b2
								
							
						
					
					
						commit
						d934700ee9
					
				| @ -1 +1 @@ | ||||
| Subproject commit ddd3ea3442ca0450ab16f88c5fab674551d35ee7 | ||||
| Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d | ||||
							
								
								
									
										21
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								wikiq
									
									
									
									
									
								
							| @ -104,6 +104,22 @@ class WikiqParser(): | ||||
|         self.collapse_user = collapse_user | ||||
|         self.persist = persist | ||||
|         self.printed_header = False | ||||
|         self.namespaces = [] | ||||
| 
 | ||||
|     def __get_namespace_from_title(self, title): | ||||
|         default_ns = None | ||||
| 
 | ||||
|         for ns in self.namespaces: | ||||
|             # skip if the namespace is not defined | ||||
|             if ns == None: | ||||
|                 default_ns = self.namespaces[ns] | ||||
|                 continue | ||||
| 
 | ||||
|             if title.startswith(ns + ":"): | ||||
|                 return self.namespaces[ns] | ||||
| 
 | ||||
|         # if we've made it this far with no matches, we return the default namespace | ||||
|         return default_ns | ||||
| 
 | ||||
|     def process(self): | ||||
|         print("Processing file: %s" % self.input_file.name, file=sys.stderr) | ||||
| @ -116,6 +132,9 @@ class WikiqParser(): | ||||
|         # Construct dump file iterator | ||||
|         dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) | ||||
| 
 | ||||
|         # extract the mapping of namespace names to ids | ||||
|         self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces} | ||||
| 
 | ||||
|         page_count = 0 | ||||
|         rev_count = 0 | ||||
|         # Iterate through pages | ||||
| @ -134,7 +153,7 @@ class WikiqParser(): | ||||
|                             'articleid' : page.id, | ||||
|                             'editor_id' : "" if rev.contributor.id == None else rev.contributor.id, | ||||
|                             'title' : '"' + page.title + '"', | ||||
|                             'namespace' : page.namespace, | ||||
|                             'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title), | ||||
|                             'deleted' : "TRUE" if rev.text.deleted else "FALSE" }  | ||||
| 
 | ||||
|                 # if revisions are deleted, /many/ things will be missing | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user