Refactor revision parsing logic to be columnar #1
							
								
								
									
										15
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								wikiq
									
									
									
									
									
								
							| @ -110,16 +110,13 @@ class WikiqPage: | ||||
| 
 | ||||
|         if not self.collapse_user: | ||||
|             for rev in self.mwpage: | ||||
|                 yield rev | ||||
|                 yield [rev] | ||||
|             return | ||||
| 
 | ||||
|         for _, revs in groupby(self.mwpage, self.user_text): | ||||
|             # All revisions are either from the same user, or this is a single | ||||
|             # revision where the user is missing. | ||||
|             revs = list(revs) | ||||
|             rev = revs[-1] | ||||
|             rev.collapsed_revs = len(revs) | ||||
|             yield rev | ||||
|             yield revs | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         return self.__revisions | ||||
| @ -461,8 +458,6 @@ class WikiqParser: | ||||
|         if self.output_parquet: | ||||
|             writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark') | ||||
|         else: | ||||
|             print(self.output_file, file=sys.stderr) | ||||
|             print(self.schema, file=sys.stderr) | ||||
|             writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t')) | ||||
| 
 | ||||
|         # Iterate through pages | ||||
| @ -497,7 +492,9 @@ class WikiqParser: | ||||
|                     state = persistence.State() | ||||
| 
 | ||||
|             # Iterate through a page's revisions | ||||
|             for rev in page: | ||||
|             for revs in page: | ||||
|                 revs = list(revs) | ||||
|                 rev = revs[-1] | ||||
| 
 | ||||
|                 editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id | ||||
|                 # create a new data object instead of a dictionary. | ||||
| @ -554,7 +551,7 @@ class WikiqParser: | ||||
| 
 | ||||
|                 # if collapse user was on, let's run that | ||||
|                 if self.collapse_user: | ||||
|                     rev_data.collapsed_revs = rev.collapsed_revs | ||||
|                     rev_data.collapsed_revs = len(revs) | ||||
| 
 | ||||
|                 # get the persistence data (only when a persistence method is selected) | ||||
|                 if self.persist != PersistMethod.none: | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user