Pass arrays of diffs instead of incremental
This is 3.5x faster.
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									96915a074b
								
							
						
					
					
						commit
						62db384aa4
					
				
							
								
								
									
										53
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										53
									
								
								wikiq
									
									
									
									
									
								
							| @ -37,6 +37,7 @@ import pyarrow.csv as pacsv | |||||||
| 
 | 
 | ||||||
| DIFFS_URL = 'http://localhost:8000' | DIFFS_URL = 'http://localhost:8000' | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| class PersistMethod: | class PersistMethod: | ||||||
|     none = 0 |     none = 0 | ||||||
|     sequence = 1 |     sequence = 1 | ||||||
| @ -57,7 +58,7 @@ def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]: | |||||||
|         if not rev.sha1 and not rev.deleted.text: |         if not rev.sha1 and not rev.deleted.text: | ||||||
|             rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() |             rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() | ||||||
|         revs[i] = rev |         revs[i] = rev | ||||||
|         i+=1 |         i += 1 | ||||||
|     return revs |     return revs | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -378,8 +379,7 @@ class WikiqParser: | |||||||
| 
 | 
 | ||||||
|         # Iterate through pages |         # Iterate through pages | ||||||
|         for page in dump: |         for page in dump: | ||||||
|             incremental_diffs = [] |             payload = [] | ||||||
|             previous_text = "" |  | ||||||
| 
 | 
 | ||||||
|             # skip namespaces not in the filter |             # skip namespaces not in the filter | ||||||
|             if self.namespace_filter is not None: |             if self.namespace_filter is not None: | ||||||
| @ -420,33 +420,34 @@ class WikiqParser: | |||||||
|                     regex_matches[k].append(v) |                     regex_matches[k].append(v) | ||||||
| 
 | 
 | ||||||
|                 if self.compute_incremental_diffs: |                 if self.compute_incremental_diffs: | ||||||
|                     payload = { |                     payload.append(rev.text) | ||||||
|                         'arg1': previous_text, |  | ||||||
|                         'arg2': rev.text, |  | ||||||
|                     } |  | ||||||
| 
 |  | ||||||
|                     try: |  | ||||||
|                         response = requests.post(DIFFS_URL, json=payload) |  | ||||||
|                         response.raise_for_status() |  | ||||||
|                         incremental_diffs.append(response.text) |  | ||||||
|                     except requests.exceptions.ConnectionError as e: |  | ||||||
|                         print( |  | ||||||
|                             f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") |  | ||||||
|                         print(e) |  | ||||||
|                         raise e |  | ||||||
|                     except requests.exceptions.HTTPError as e: |  | ||||||
|                         print(f"HTTP Error: {e}") |  | ||||||
|                         print(f"Response Body: {response.text}") |  | ||||||
|                         raise e |  | ||||||
|                     except requests.exceptions.RequestException as e: |  | ||||||
|                         print(f"An unexpected error occurred: {e}") |  | ||||||
|                         raise e |  | ||||||
| 
 |  | ||||||
|                     previous_text = rev.text |  | ||||||
| 
 | 
 | ||||||
|             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. |             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||||
|             row_buffer = table.pop() |             row_buffer = table.pop() | ||||||
|             if self.compute_incremental_diffs: |             if self.compute_incremental_diffs: | ||||||
|  |                 try: | ||||||
|  |                     response = requests.post(DIFFS_URL, json=payload) | ||||||
|  |                     response.raise_for_status() | ||||||
|  |                     incremental_diffs = response.json() | ||||||
|  |                 except requests.exceptions.ConnectionError as e: | ||||||
|  |                     print( | ||||||
|  |                         f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") | ||||||
|  |                     print(e) | ||||||
|  |                     raise e | ||||||
|  |                 except requests.exceptions.HTTPError as e: | ||||||
|  |                     print(f"HTTP Error: {e}") | ||||||
|  |                     print(f"Response Body: {response.text}") | ||||||
|  |                     raise e | ||||||
|  |                 except requests.exceptions.JSONDecodeError as e: | ||||||
|  |                     # Must come before RequestException as JSONDecodeError is | ||||||
|  |                     # a subclass. | ||||||
|  |                     print(f"JSON Decode Error: {e}", file=sys.stderr) | ||||||
|  |                     print(f"Response Body: {response.text}", file=sys.stderr) | ||||||
|  |                     raise e | ||||||
|  |                 except requests.exceptions.RequestException as e: | ||||||
|  |                     print(f"An unexpected error occurred: {e}") | ||||||
|  |                     raise e | ||||||
|  | 
 | ||||||
|                 row_buffer['incremental diffs'] = incremental_diffs |                 row_buffer['incremental diffs'] = incremental_diffs | ||||||
| 
 | 
 | ||||||
|             is_revert_column: list[Union[bool, None]] = [] |             is_revert_column: list[Union[bool, None]] = [] | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user