Timeout diffs.

parent 730c678f51
commit a563eaf6fc

@@ -23,13 +23,14 @@ import pywikidiff2
from deltas.tokenizers import wikitext_split
from more_itertools import ichunked
from mwxml import Dump

import asyncio
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher

TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
DIFF_TIMEOUT = 60*20
from pathlib import Path

import pyarrow as pa
@@ -46,6 +47,15 @@ class PersistMethod:
    wikidiff2 = 4


async def diff_async(differ, last_text, text):
    async def _diff():
        return differ.inline_json_diff(last_text, text)
    try:
        result = await asyncio.wait_for(_diff(), DIFF_TIMEOUT)
    except asyncio.TimeoutError:
        raise
    return result


def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added))

@@ -493,11 +503,21 @@ class WikiqParser:
                    num_context_lines=1000000,
                    max_word_level_diff_complexity=-1,
                    moved_paragraph_detection_cutoff=-1,
                    words_cache_capacity=10000,
                    diff_cache_capacity=10000,
                    stats_cache_capacity=10000,
                )

                fast_differ = pywikidiff2.pywikidiff2(
                    num_context_lines=1000000,
                    max_word_level_diff_complexity=40000000,
                    moved_paragraph_detection_cutoff=100,
                    words_cache_capacity=-1,
                    diff_cache_capacity=-1,
                    stats_cache_capacity=-1,
                )


            while not on_last_batch:
                # first loop: next_batch <- batch;
                # second loop: next_batch <- batch; evaluate next_batch.
@@ -657,7 +677,12 @@ class WikiqParser:
                    last_text = last_rev_text
                    new_diffs = []
                    for text in row_buffer["text"]:
                        new_diffs.append(differ.inline_json_diff(last_text, text))
                        try:
                            diff = asyncio.run(diff_async(differ, last_text, text))
                        except asyncio.TimeoutError:
                            print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.", file=sys.stderr)
                            diff = fast_differ.inline_json_diff(last_text, text)
                        new_diffs.append(diff)
                        last_text = text
                    row_buffer["diff"] = [
                        [
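
Taken together, the hunks add a timeout-with-fallback strategy: each revision is diffed with the unbounded differ under a DIFF_TIMEOUT deadline, and a revision that times out is re-diffed with the bounded fast_differ. The sketch below restates that pattern as a self-contained helper, not as the commit's exact code: the names diff_with_fallback and diff_texts are illustrative, and the asyncio.to_thread offload is an assumption added here so that wait_for can observe the deadline while the synchronous inline_json_diff call runs (the commit instead wraps the call in a plain coroutine, which runs it on the event loop itself), assuming the extension releases the GIL during the diff. The worker thread is not cancelled on timeout; it simply stops being awaited.

import asyncio
import sys

DIFF_TIMEOUT = 60 * 20  # seconds, matching the constant added in the first hunk


async def diff_with_fallback(differ, fast_differ, last_text, text,
                             timeout=DIFF_TIMEOUT):
    try:
        # Unbounded differ first, under a deadline. to_thread keeps the
        # event loop free so wait_for can enforce the timeout.
        return await asyncio.wait_for(
            asyncio.to_thread(differ.inline_json_diff, last_text, text),
            timeout,
        )
    except asyncio.TimeoutError:
        print("WARNING! wikidiff2 timeout. Falling back to default limits.",
              file=sys.stderr)
        # Bounded (fast) configuration as the fallback.
        return fast_differ.inline_json_diff(last_text, text)


def diff_texts(differ, fast_differ, first_text, texts):
    # Mirrors the row_buffer loop: diff each text against its predecessor.
    last_text = first_text
    diffs = []
    for text in texts:
        diffs.append(asyncio.run(
            diff_with_fallback(differ, fast_differ, last_text, text)))
        last_text = text
    return diffs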