Pass arrays of diffs instead of incremental
This is 3.5x faster.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									96915a074b
								
							
						
					
					
						commit
						62db384aa4
					
				
							
								
								
									
										25
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								wikiq
									
									
									
									
									
								
							| @ -37,6 +37,7 @@ import pyarrow.csv as pacsv | ||||
| 
 | ||||
| DIFFS_URL = 'http://localhost:8000' | ||||
| 
 | ||||
| 
 | ||||
| class PersistMethod: | ||||
|     none = 0 | ||||
|     sequence = 1 | ||||
| @ -378,8 +379,7 @@ class WikiqParser: | ||||
| 
 | ||||
|         # Iterate through pages | ||||
|         for page in dump: | ||||
|             incremental_diffs = [] | ||||
|             previous_text = "" | ||||
|             payload = [] | ||||
| 
 | ||||
|             # skip namespaces not in the filter | ||||
|             if self.namespace_filter is not None: | ||||
| @ -420,15 +420,15 @@ class WikiqParser: | ||||
|                     regex_matches[k].append(v) | ||||
| 
 | ||||
|                 if self.compute_incremental_diffs: | ||||
|                     payload = { | ||||
|                         'arg1': previous_text, | ||||
|                         'arg2': rev.text, | ||||
|                     } | ||||
|                     payload.append(rev.text) | ||||
| 
 | ||||
|             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||
|             row_buffer = table.pop() | ||||
|             if self.compute_incremental_diffs: | ||||
|                 try: | ||||
|                     response = requests.post(DIFFS_URL, json=payload) | ||||
|                     response.raise_for_status() | ||||
|                         incremental_diffs.append(response.text) | ||||
|                     incremental_diffs = response.json() | ||||
|                 except requests.exceptions.ConnectionError as e: | ||||
|                     print( | ||||
|                         f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") | ||||
| @ -438,15 +438,16 @@ class WikiqParser: | ||||
|                     print(f"HTTP Error: {e}") | ||||
|                     print(f"Response Body: {response.text}") | ||||
|                     raise e | ||||
|                 except requests.exceptions.JSONDecodeError as e: | ||||
|                     # Must come before RequestException as JSONDecodeError is | ||||
|                     # a subclass. | ||||
|                     print(f"JSON Decode Error: {e}", file=sys.stderr) | ||||
|                     print(f"Response Body: {response.text}", file=sys.stderr) | ||||
|                     raise e | ||||
|                 except requests.exceptions.RequestException as e: | ||||
|                     print(f"An unexpected error occurred: {e}") | ||||
|                     raise e | ||||
| 
 | ||||
|                     previous_text = rev.text | ||||
| 
 | ||||
|             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||
|             row_buffer = table.pop() | ||||
|             if self.compute_incremental_diffs: | ||||
|                 row_buffer['incremental diffs'] = incremental_diffs | ||||
| 
 | ||||
|             is_revert_column: list[Union[bool, None]] = [] | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user