Factor out revision mutation logic into its own function
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									8c707f5ef3
								
							
						
					
					
						commit
						3e8ae205e8
					
				
							
								
								
									
										68
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								wikiq
									
									
									
									
									
								
							| @ -47,6 +47,18 @@ def calculate_persistence(tokens_added): | |||||||
|             len(tokens_added)) |             len(tokens_added)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]:
    """Normalize revision text and backfill missing SHA-1 digests.

    Each revision is mutated in place: a ``None`` text is replaced with the
    empty string, and when a revision has no ``sha1`` and its text is not
    deleted, the hex digest of its (UTF-8 encoded) text is computed for it.

    :param revs: revisions to normalize; mutated in place.
    :return: the same list, returned for caller convenience.
    """
    # The original loop kept a manual index and wrote `revs[i] = rev` back,
    # but `rev` is already the list element being mutated in place, so the
    # writeback (and the index bookkeeping) was a no-op — dropped here.
    for rev in revs:
        if rev.text is None:
            rev.text = ""
        # Deleted text is unavailable, so no digest can be computed for it.
        if not rev.sha1 and not rev.deleted.text:
            rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
    return revs
|  | 
 | ||||||
|  | 
 | ||||||
| class WikiqIterator: | class WikiqIterator: | ||||||
|     def __init__(self, fh, collapse_user=False): |     def __init__(self, fh, collapse_user=False): | ||||||
|         self.fh = fh |         self.fh = fh | ||||||
| @ -370,16 +382,11 @@ class WikiqParser: | |||||||
| 
 | 
 | ||||||
|             # Iterate through a page's revisions |             # Iterate through a page's revisions | ||||||
|             for revs in page: |             for revs in page: | ||||||
|  |                 # Revisions may or may not be grouped into lists of contiguous revisions by the | ||||||
|  |                 # same user. We call these "edit sessions". Otherwise revs is a list containing | ||||||
|  |                 # exactly one revision. | ||||||
|                 revs = list(revs) |                 revs = list(revs) | ||||||
|                 rev = revs[-1] |                 revs = fix_hex_digests(revs) | ||||||
| 
 |  | ||||||
|                 if rev.text is None: |  | ||||||
|                     rev.text = "" |  | ||||||
| 
 |  | ||||||
|                 if not rev.sha1 and not rev.deleted.text: |  | ||||||
|                     rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() |  | ||||||
| 
 |  | ||||||
|                 revs[-1] = rev |  | ||||||
| 
 | 
 | ||||||
|                 table.add(page.mwpage, list(revs)) |                 table.add(page.mwpage, list(revs)) | ||||||
| 
 | 
 | ||||||
| @ -392,34 +399,37 @@ class WikiqParser: | |||||||
| 
 | 
 | ||||||
|                 rev_count += 1 |                 rev_count += 1 | ||||||
| 
 | 
 | ||||||
|  |                 # Get the last revision in the edit session. | ||||||
|  |                 rev = revs[-1] | ||||||
|                 regex_dict = self.matchmake_revision(rev) |                 regex_dict = self.matchmake_revision(rev) | ||||||
|                 for k, v in regex_dict.items(): |                 for k, v in regex_dict.items(): | ||||||
|                     if regex_matches.get(k) is None: |                     if regex_matches.get(k) is None: | ||||||
|                         regex_matches[k] = [] |                         regex_matches[k] = [] | ||||||
|                     regex_matches[k].append(v) |                     regex_matches[k].append(v) | ||||||
| 
 | 
 | ||||||
|             buffer = table.pop() |             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||||
|  |             row_buffer = table.pop() | ||||||
| 
 | 
 | ||||||
|             is_revert_column: list[bool | None] = [] |             is_revert_column: list[bool | None] = [] | ||||||
|             for r, d in zip(buffer['reverteds'], buffer['deleted']): |             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): | ||||||
|                 if self.revert_radius == 0 or d: |                 if self.revert_radius == 0 or d: | ||||||
|                     is_revert_column.append(None) |                     is_revert_column.append(None) | ||||||
|                 else: |                 else: | ||||||
|                     is_revert_column.append(r is not None) |                     is_revert_column.append(r is not None) | ||||||
| 
 | 
 | ||||||
|             buffer['revert'] = is_revert_column |             row_buffer['revert'] = is_revert_column | ||||||
| 
 | 
 | ||||||
|             for k, v in regex_matches.items(): |             for k, v in regex_matches.items(): | ||||||
|                 buffer[k] = v |                 row_buffer[k] = v | ||||||
|                 regex_matches = {} |                 regex_matches = {} | ||||||
| 
 | 
 | ||||||
|             if self.persist != PersistMethod.none: |             if self.persist != PersistMethod.none: | ||||||
|                 window = deque(maxlen=PERSISTENCE_RADIUS) |                 window = deque(maxlen=PERSISTENCE_RADIUS) | ||||||
| 
 | 
 | ||||||
|                 buffer['token_revs'] = [] |                 row_buffer['token_revs'] = [] | ||||||
|                 buffer['tokens_added'] = [] |                 row_buffer['tokens_added'] = [] | ||||||
|                 buffer['tokens_removed'] = [] |                 row_buffer['tokens_removed'] = [] | ||||||
|                 buffer['tokens_window'] = [] |                 row_buffer['tokens_window'] = [] | ||||||
| 
 | 
 | ||||||
|                 if self.persist == PersistMethod.sequence: |                 if self.persist == PersistMethod.sequence: | ||||||
|                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), |                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), | ||||||
| @ -431,8 +441,8 @@ class WikiqParser: | |||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
|                     state = persistence.State() |                     state = persistence.State() | ||||||
| 
 | 
 | ||||||
|                 for idx, text in enumerate(buffer['text']): |                 for idx, text in enumerate(row_buffer['text']): | ||||||
|                     rev_id = buffer['revid'][idx] |                     rev_id = row_buffer['revid'][idx] | ||||||
|                     if self.persist != PersistMethod.legacy: |                     if self.persist != PersistMethod.legacy: | ||||||
|                         _, tokens_added, tokens_removed = state.update(text, rev_id) |                         _, tokens_added, tokens_removed = state.update(text, rev_id) | ||||||
|                     else: |                     else: | ||||||
| @ -444,12 +454,12 @@ class WikiqParser: | |||||||
|                         old_rev_id, old_tokens_added, old_tokens_removed = window.popleft() |                         old_rev_id, old_tokens_added, old_tokens_removed = window.popleft() | ||||||
|                         num_token_revs, num_tokens = calculate_persistence(old_tokens_added) |                         num_token_revs, num_tokens = calculate_persistence(old_tokens_added) | ||||||
| 
 | 
 | ||||||
|                         buffer['token_revs'].append(num_token_revs) |                         row_buffer['token_revs'].append(num_token_revs) | ||||||
|                         buffer['tokens_added'].append(num_tokens) |                         row_buffer['tokens_added'].append(num_tokens) | ||||||
|                         buffer['tokens_removed'].append(len(old_tokens_removed)) |                         row_buffer['tokens_removed'].append(len(old_tokens_removed)) | ||||||
|                         buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) |                         row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) | ||||||
| 
 | 
 | ||||||
|                 del buffer['text'] |                 del row_buffer['text'] | ||||||
| 
 | 
 | ||||||
|                 # print out metadata for the last RADIUS revisions |                 # print out metadata for the last RADIUS revisions | ||||||
|                 for i, item in enumerate(window): |                 for i, item in enumerate(window): | ||||||
| @ -460,12 +470,12 @@ class WikiqParser: | |||||||
|                     rev_id, tokens_added, tokens_removed = item |                     rev_id, tokens_added, tokens_removed = item | ||||||
|                     num_token_revs, num_tokens = calculate_persistence(tokens_added) |                     num_token_revs, num_tokens = calculate_persistence(tokens_added) | ||||||
| 
 | 
 | ||||||
|                     buffer['token_revs'].append(num_token_revs) |                     row_buffer['token_revs'].append(num_token_revs) | ||||||
|                     buffer['tokens_added'].append(num_tokens) |                     row_buffer['tokens_added'].append(num_tokens) | ||||||
|                     buffer['tokens_removed'].append(len(tokens_removed)) |                     row_buffer['tokens_removed'].append(len(tokens_removed)) | ||||||
|                     buffer['tokens_window'].append(len(window) - (i + 1)) |                     row_buffer['tokens_window'].append(len(window) - (i + 1)) | ||||||
| 
 | 
 | ||||||
|             writer.write(pa.table(buffer, schema=schema)) |             writer.write(pa.table(row_buffer, schema=schema)) | ||||||
| 
 | 
 | ||||||
|             page_count += 1 |             page_count += 1 | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user