Factor out revision mutation logic into its own function
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									8c707f5ef3
								
							
						
					
					
						commit
						3e8ae205e8
					
				
							
								
								
									
										68
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								wikiq
									
									
									
									
									
								
							| @ -47,6 +47,18 @@ def calculate_persistence(tokens_added): | ||||
|             len(tokens_added)) | ||||
| 
 | ||||
| 
 | ||||
def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]:
    """Normalize revision text and fill in missing SHA-1 digests, in place.

    For every revision in ``revs``:

    * a ``text`` of ``None`` is replaced with the empty string;
    * if ``sha1`` is falsy and ``rev.deleted.text`` is falsy, ``sha1`` is
      set to the hex SHA-1 digest of the UTF-8 encoded text.

    The revision objects are mutated directly; the same list object that
    was passed in is returned, so ``revs = fix_hex_digests(revs)`` works.
    """
    for rev in revs:
        if rev.text is None:
            rev.text = ""
        # Leave existing digests alone, and do not compute one when the
        # revision's text is flagged via ``rev.deleted.text``.
        if not rev.sha1 and not rev.deleted.text:
            rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
    return revs
| 
 | ||||
| 
 | ||||
| class WikiqIterator: | ||||
|     def __init__(self, fh, collapse_user=False): | ||||
|         self.fh = fh | ||||
| @ -370,16 +382,11 @@ class WikiqParser: | ||||
| 
 | ||||
|             # Iterate through a page's revisions | ||||
|             for revs in page: | ||||
|                 # Revisions may or may not be grouped into lists of contiguous revisions by the | ||||
|                 # same user. We call these "edit sessions". Otherwise revs is a list containing | ||||
|                 # exactly one revision. | ||||
|                 revs = list(revs) | ||||
|                 rev = revs[-1] | ||||
| 
 | ||||
|                 if rev.text is None: | ||||
|                     rev.text = "" | ||||
| 
 | ||||
|                 if not rev.sha1 and not rev.deleted.text: | ||||
|                     rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() | ||||
| 
 | ||||
|                 revs[-1] = rev | ||||
|                 revs = fix_hex_digests(revs) | ||||
| 
 | ||||
|                 table.add(page.mwpage, list(revs)) | ||||
| 
 | ||||
| @ -392,34 +399,37 @@ class WikiqParser: | ||||
| 
 | ||||
|                 rev_count += 1 | ||||
| 
 | ||||
|                 # Get the last revision in the edit session. | ||||
|                 rev = revs[-1] | ||||
|                 regex_dict = self.matchmake_revision(rev) | ||||
|                 for k, v in regex_dict.items(): | ||||
|                     if regex_matches.get(k) is None: | ||||
|                         regex_matches[k] = [] | ||||
|                     regex_matches[k].append(v) | ||||
| 
 | ||||
|             buffer = table.pop() | ||||
|             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||
|             row_buffer = table.pop() | ||||
| 
 | ||||
|             is_revert_column: list[bool | None] = [] | ||||
|             for r, d in zip(buffer['reverteds'], buffer['deleted']): | ||||
|             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): | ||||
|                 if self.revert_radius == 0 or d: | ||||
|                     is_revert_column.append(None) | ||||
|                 else: | ||||
|                     is_revert_column.append(r is not None) | ||||
| 
 | ||||
|             buffer['revert'] = is_revert_column | ||||
|             row_buffer['revert'] = is_revert_column | ||||
| 
 | ||||
|             for k, v in regex_matches.items(): | ||||
|                 buffer[k] = v | ||||
|                 row_buffer[k] = v | ||||
|                 regex_matches = {} | ||||
| 
 | ||||
|             if self.persist != PersistMethod.none: | ||||
|                 window = deque(maxlen=PERSISTENCE_RADIUS) | ||||
| 
 | ||||
|                 buffer['token_revs'] = [] | ||||
|                 buffer['tokens_added'] = [] | ||||
|                 buffer['tokens_removed'] = [] | ||||
|                 buffer['tokens_window'] = [] | ||||
|                 row_buffer['token_revs'] = [] | ||||
|                 row_buffer['tokens_added'] = [] | ||||
|                 row_buffer['tokens_removed'] = [] | ||||
|                 row_buffer['tokens_window'] = [] | ||||
| 
 | ||||
|                 if self.persist == PersistMethod.sequence: | ||||
|                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), | ||||
| @ -431,8 +441,8 @@ class WikiqParser: | ||||
|                     from mw.lib import persistence | ||||
|                     state = persistence.State() | ||||
| 
 | ||||
|                 for idx, text in enumerate(buffer['text']): | ||||
|                     rev_id = buffer['revid'][idx] | ||||
|                 for idx, text in enumerate(row_buffer['text']): | ||||
|                     rev_id = row_buffer['revid'][idx] | ||||
|                     if self.persist != PersistMethod.legacy: | ||||
|                         _, tokens_added, tokens_removed = state.update(text, rev_id) | ||||
|                     else: | ||||
| @ -444,12 +454,12 @@ class WikiqParser: | ||||
|                         old_rev_id, old_tokens_added, old_tokens_removed = window.popleft() | ||||
|                         num_token_revs, num_tokens = calculate_persistence(old_tokens_added) | ||||
| 
 | ||||
|                         buffer['token_revs'].append(num_token_revs) | ||||
|                         buffer['tokens_added'].append(num_tokens) | ||||
|                         buffer['tokens_removed'].append(len(old_tokens_removed)) | ||||
|                         buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) | ||||
|                         row_buffer['token_revs'].append(num_token_revs) | ||||
|                         row_buffer['tokens_added'].append(num_tokens) | ||||
|                         row_buffer['tokens_removed'].append(len(old_tokens_removed)) | ||||
|                         row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) | ||||
| 
 | ||||
|                 del buffer['text'] | ||||
|                 del row_buffer['text'] | ||||
| 
 | ||||
|                 # print out metadata for the last RADIUS revisions | ||||
|                 for i, item in enumerate(window): | ||||
| @ -460,12 +470,12 @@ class WikiqParser: | ||||
|                     rev_id, tokens_added, tokens_removed = item | ||||
|                     num_token_revs, num_tokens = calculate_persistence(tokens_added) | ||||
| 
 | ||||
|                     buffer['token_revs'].append(num_token_revs) | ||||
|                     buffer['tokens_added'].append(num_tokens) | ||||
|                     buffer['tokens_removed'].append(len(tokens_removed)) | ||||
|                     buffer['tokens_window'].append(len(window) - (i + 1)) | ||||
|                     row_buffer['token_revs'].append(num_token_revs) | ||||
|                     row_buffer['tokens_added'].append(num_tokens) | ||||
|                     row_buffer['tokens_removed'].append(len(tokens_removed)) | ||||
|                     row_buffer['tokens_window'].append(len(window) - (i + 1)) | ||||
| 
 | ||||
|             writer.write(pa.table(buffer, schema=schema)) | ||||
|             writer.write(pa.table(row_buffer, schema=schema)) | ||||
| 
 | ||||
|             page_count += 1 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user