Refactor collapse user logic
Use a simple loop when we aren't collapsing users. Add a test that covers the case where users are deleted. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									c0e629a313
								
							
						
					
					
						commit
						aec6e5fafa
					
				| @ -177,6 +177,19 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_WP_collapse_user(self): | ||||
|         tester = WikiqTester(IKWIKI, "collapse_user") | ||||
| 
 | ||||
|         try: | ||||
|             tester.call_wikiq("--collapse-user") | ||||
|         except subprocess.CalledProcessError as exc: | ||||
|             self.fail(exc.stderr.decode("utf8")) | ||||
| 
 | ||||
|         copyfile(tester.call_output, tester.test_file) | ||||
|         test = pd.read_table(tester.test_file) | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_noargs(self): | ||||
|         tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") | ||||
| 
 | ||||
|  | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										73
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										73
									
								
								wikiq
									
									
									
									
									
								
							| @ -102,35 +102,28 @@ class WikiqPage: | ||||
|         #         3          A               B            True | ||||
|         #         4          A               A           False | ||||
|         # Post-loop                          A          Always | ||||
|         collapsed_revs = 0 | ||||
|         for i, rev in enumerate(self.mwpage): | ||||
|             # never yield the first time | ||||
|             if i == 0: | ||||
|                 if self.collapse_user: | ||||
|                     collapsed_revs = 1 | ||||
|                     rev.collapsed_revs = collapsed_revs | ||||
| 
 | ||||
|         if not self.collapse_user: | ||||
|             for rev in self.mwpage: | ||||
|                 yield rev | ||||
|             return | ||||
| 
 | ||||
|         collapsed_revs = 1 | ||||
|         prev_rev = next(self.mwpage) | ||||
|         prev_rev.collapsed_revs = collapsed_revs | ||||
| 
 | ||||
|         for rev in self.mwpage: | ||||
|             # yield if this is the last edit in a seq by a user and reset | ||||
|             # also yield if we don't know who the user is | ||||
| 
 | ||||
|             if rev.deleted.user or prev_rev.deleted.user or rev.user.text != prev_rev.user.text: | ||||
|                 yield prev_rev | ||||
|                 collapsed_revs = 1 | ||||
|                 rev.collapsed_revs = collapsed_revs | ||||
|             else: | ||||
|                 if self.collapse_user: | ||||
|                     # yield if this is the last edit in a seq by a user and reset | ||||
|                     # also yield if we do know who the user is | ||||
| 
 | ||||
|                     if rev.deleted.user or prev_rev.deleted.user: | ||||
|                         yield prev_rev | ||||
|                         collapsed_revs = 1 | ||||
|                         rev.collapsed_revs = collapsed_revs | ||||
| 
 | ||||
|                     elif not rev.user.text == prev_rev.user.text: | ||||
|                         yield prev_rev | ||||
|                         collapsed_revs = 1 | ||||
|                         rev.collapsed_revs = collapsed_revs | ||||
|                     # otherwise, add one to the counter | ||||
|                     else: | ||||
|                         collapsed_revs += 1 | ||||
|                         rev.collapsed_revs = collapsed_revs | ||||
|                 # if collapse_user is false, we always yield | ||||
|                 else: | ||||
|                     yield prev_rev | ||||
|                 # Otherwise, collapse revision. | ||||
|                 collapsed_revs += 1 | ||||
|                 rev.collapsed_revs = collapsed_revs | ||||
| 
 | ||||
|             prev_rev = rev | ||||
| 
 | ||||
| @ -239,7 +232,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq | ||||
| 
 | ||||
| 
 | ||||
| @dataclass() | ||||
| class RevDataBase: | ||||
| class Revision: | ||||
|     revid: int | ||||
|     date_time: datetime | ||||
|     articleid: int | ||||
| @ -247,13 +240,13 @@ class RevDataBase: | ||||
|     title: str | ||||
|     namespace: int | ||||
|     deleted: bool | ||||
|     text_chars: int = None | ||||
|     revert: bool = None | ||||
|     text_chars: int | None = None | ||||
|     revert: bool | None = None | ||||
|     reverteds: list[int] = None | ||||
|     sha1: str = None | ||||
|     minor: bool = None | ||||
|     editor: str = None | ||||
|     anon: bool = None | ||||
|     sha1: str | None = None | ||||
|     minor: bool | None = None | ||||
|     editor: str | None = None | ||||
|     anon: bool | None = None | ||||
| 
 | ||||
|     # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation | ||||
|     urlencode = False | ||||
| @ -332,11 +325,11 @@ It just adds a new field and updates the pyarrow schema. | ||||
| 
 | ||||
| 
 | ||||
| @dataclass() | ||||
| class RevDataCollapse(RevDataBase): | ||||
| class RevDataCollapse(Revision): | ||||
|     collapsed_revs: int = None | ||||
| 
 | ||||
|     pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64()) | ||||
|     pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema] | ||||
|     pa_schema_fields = Revision.pa_schema_fields + [pa_collapsed_revs_schema] | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| @ -347,7 +340,7 @@ If persistence data is to be computed we'll need the fields added by RevDataPers | ||||
| 
 | ||||
| 
 | ||||
| @dataclass() | ||||
| class RevDataPersistence(RevDataBase): | ||||
| class RevDataPersistence(Revision): | ||||
|     token_revs: int = None | ||||
|     tokens_added: int = None | ||||
|     tokens_removed: int = None | ||||
| @ -359,7 +352,7 @@ class RevDataPersistence(RevDataBase): | ||||
|         pa.field("tokens_removed", pa.int64()), | ||||
|         pa.field("tokens_window", pa.int64())] | ||||
| 
 | ||||
|     pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields | ||||
|     pa_schema_fields = Revision.pa_schema_fields + pa_persistence_schema_fields | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| @ -394,7 +387,7 @@ class WikiqParser: | ||||
|         """ | ||||
|         self.input_file = input_file | ||||
| 
 | ||||
|         self.collapse_user = collapse_user | ||||
|         self.collapse_user: bool = collapse_user | ||||
|         self.persist: int = persist | ||||
|         self.namespaces = [] | ||||
|         self.urlencode: bool = urlencode | ||||
| @ -419,7 +412,7 @@ class WikiqParser: | ||||
|         elif self.persist != PersistMethod.none: | ||||
|             revdata_type = RevDataPersistence | ||||
|         else: | ||||
|             revdata_type = RevDataBase | ||||
|             revdata_type = Revision | ||||
| 
 | ||||
|         # if there are regex fields, we need to add them to the revdata type. | ||||
|         regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas] | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user