Refactor collapse user logic
Use a simple loop when we aren't collapsing users. Add a test that covers the case where users are deleted. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									c0e629a313
								
							
						
					
					
						commit
						aec6e5fafa
					
				| @ -177,6 +177,19 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         baseline = pd.read_table(tester.baseline_file) |         baseline = pd.read_table(tester.baseline_file) | ||||||
|         assert_frame_equal(test, baseline, check_like=True) |         assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|  |     def test_WP_collapse_user(self): | ||||||
|  |         tester = WikiqTester(IKWIKI, "collapse_user") | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             tester.call_wikiq("--collapse-user") | ||||||
|  |         except subprocess.CalledProcessError as exc: | ||||||
|  |             self.fail(exc.stderr.decode("utf8")) | ||||||
|  | 
 | ||||||
|  |         copyfile(tester.call_output, tester.test_file) | ||||||
|  |         test = pd.read_table(tester.test_file) | ||||||
|  |         baseline = pd.read_table(tester.baseline_file) | ||||||
|  |         assert_frame_equal(test, baseline, check_like=True) | ||||||
|  | 
 | ||||||
|     def test_noargs(self): |     def test_noargs(self): | ||||||
|         tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") |         tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										59
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										59
									
								
								wikiq
									
									
									
									
									
								
							| @ -102,35 +102,28 @@ class WikiqPage: | |||||||
|         #         3          A               B            True |         #         3          A               B            True | ||||||
|         #         4          A               A           False |         #         4          A               A           False | ||||||
|         # Post-loop                          A          Always |         # Post-loop                          A          Always | ||||||
|         collapsed_revs = 0 |  | ||||||
|         for i, rev in enumerate(self.mwpage): |  | ||||||
|             # never yield the first time |  | ||||||
|             if i == 0: |  | ||||||
|                 if self.collapse_user: |  | ||||||
|                     collapsed_revs = 1 |  | ||||||
|                     rev.collapsed_revs = collapsed_revs |  | ||||||
| 
 | 
 | ||||||
|             else: |         if not self.collapse_user: | ||||||
|                 if self.collapse_user: |             for rev in self.mwpage: | ||||||
|  |                 yield rev | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         collapsed_revs = 1 | ||||||
|  |         prev_rev = next(self.mwpage) | ||||||
|  |         prev_rev.collapsed_revs = collapsed_revs | ||||||
|  | 
 | ||||||
|  |         for rev in self.mwpage: | ||||||
|             # yield if this is the last edit in a seq by a user and reset |             # yield if this is the last edit in a seq by a user and reset | ||||||
|                     # also yield if we do know who the user is |             # also yield if we don't know who the user is | ||||||
| 
 | 
 | ||||||
|                     if rev.deleted.user or prev_rev.deleted.user: |             if rev.deleted.user or prev_rev.deleted.user or rev.user.text != prev_rev.user.text: | ||||||
|                 yield prev_rev |                 yield prev_rev | ||||||
|                 collapsed_revs = 1 |                 collapsed_revs = 1 | ||||||
|                 rev.collapsed_revs = collapsed_revs |                 rev.collapsed_revs = collapsed_revs | ||||||
| 
 |  | ||||||
|                     elif not rev.user.text == prev_rev.user.text: |  | ||||||
|                         yield prev_rev |  | ||||||
|                         collapsed_revs = 1 |  | ||||||
|                         rev.collapsed_revs = collapsed_revs |  | ||||||
|                     # otherwise, add one to the counter |  | ||||||
|             else: |             else: | ||||||
|  |                 # Otherwise, collapse revision. | ||||||
|                 collapsed_revs += 1 |                 collapsed_revs += 1 | ||||||
|                 rev.collapsed_revs = collapsed_revs |                 rev.collapsed_revs = collapsed_revs | ||||||
|                 # if collapse_user is false, we always yield |  | ||||||
|                 else: |  | ||||||
|                     yield prev_rev |  | ||||||
| 
 | 
 | ||||||
|             prev_rev = rev |             prev_rev = rev | ||||||
| 
 | 
 | ||||||
| @ -239,7 +232,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @dataclass() | @dataclass() | ||||||
| class RevDataBase: | class Revision: | ||||||
|     revid: int |     revid: int | ||||||
|     date_time: datetime |     date_time: datetime | ||||||
|     articleid: int |     articleid: int | ||||||
| @ -247,13 +240,13 @@ class RevDataBase: | |||||||
|     title: str |     title: str | ||||||
|     namespace: int |     namespace: int | ||||||
|     deleted: bool |     deleted: bool | ||||||
|     text_chars: int = None |     text_chars: int | None = None | ||||||
|     revert: bool = None |     revert: bool | None = None | ||||||
|     reverteds: list[int] = None |     reverteds: list[int] = None | ||||||
|     sha1: str = None |     sha1: str | None = None | ||||||
|     minor: bool = None |     minor: bool | None = None | ||||||
|     editor: str = None |     editor: str | None = None | ||||||
|     anon: bool = None |     anon: bool | None = None | ||||||
| 
 | 
 | ||||||
|     # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation |     # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation | ||||||
|     urlencode = False |     urlencode = False | ||||||
| @ -332,11 +325,11 @@ It just adds a new field and updates the pyarrow schema. | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @dataclass() | @dataclass() | ||||||
| class RevDataCollapse(RevDataBase): | class RevDataCollapse(Revision): | ||||||
|     collapsed_revs: int = None |     collapsed_revs: int = None | ||||||
| 
 | 
 | ||||||
|     pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64()) |     pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64()) | ||||||
|     pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema] |     pa_schema_fields = Revision.pa_schema_fields + [pa_collapsed_revs_schema] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| @ -347,7 +340,7 @@ If persistence data is to be computed we'll need the fields added by RevDataPers | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @dataclass() | @dataclass() | ||||||
| class RevDataPersistence(RevDataBase): | class RevDataPersistence(Revision): | ||||||
|     token_revs: int = None |     token_revs: int = None | ||||||
|     tokens_added: int = None |     tokens_added: int = None | ||||||
|     tokens_removed: int = None |     tokens_removed: int = None | ||||||
| @ -359,7 +352,7 @@ class RevDataPersistence(RevDataBase): | |||||||
|         pa.field("tokens_removed", pa.int64()), |         pa.field("tokens_removed", pa.int64()), | ||||||
|         pa.field("tokens_window", pa.int64())] |         pa.field("tokens_window", pa.int64())] | ||||||
| 
 | 
 | ||||||
|     pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields |     pa_schema_fields = Revision.pa_schema_fields + pa_persistence_schema_fields | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| @ -394,7 +387,7 @@ class WikiqParser: | |||||||
|         """ |         """ | ||||||
|         self.input_file = input_file |         self.input_file = input_file | ||||||
| 
 | 
 | ||||||
|         self.collapse_user = collapse_user |         self.collapse_user: bool = collapse_user | ||||||
|         self.persist: int = persist |         self.persist: int = persist | ||||||
|         self.namespaces = [] |         self.namespaces = [] | ||||||
|         self.urlencode: bool = urlencode |         self.urlencode: bool = urlencode | ||||||
| @ -419,7 +412,7 @@ class WikiqParser: | |||||||
|         elif self.persist != PersistMethod.none: |         elif self.persist != PersistMethod.none: | ||||||
|             revdata_type = RevDataPersistence |             revdata_type = RevDataPersistence | ||||||
|         else: |         else: | ||||||
|             revdata_type = RevDataBase |             revdata_type = Revision | ||||||
| 
 | 
 | ||||||
|         # if there are regex fields, we need to add them to the revdata type. |         # if there are regex fields, we need to add them to the revdata type. | ||||||
|         regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas] |         regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas] | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user