Timeout diffs.
This commit is contained in:
		
							parent
							
								
									730c678f51
								
							
						
					
					
						commit
						a563eaf6fc
					
				| @ -23,13 +23,14 @@ import pywikidiff2 | |||||||
| from deltas.tokenizers import wikitext_split | from deltas.tokenizers import wikitext_split | ||||||
| from more_itertools import ichunked | from more_itertools import ichunked | ||||||
| from mwxml import Dump | from mwxml import Dump | ||||||
| 
 | import asyncio | ||||||
| import wikiq.tables as tables | import wikiq.tables as tables | ||||||
| from wikiq.tables import RevisionTable | from wikiq.tables import RevisionTable | ||||||
| from wikiq.wiki_diff_matcher import WikiDiffMatcher | from wikiq.wiki_diff_matcher import WikiDiffMatcher | ||||||
| 
 | 
 | ||||||
| TO_ENCODE = ("title", "editor") | TO_ENCODE = ("title", "editor") | ||||||
| PERSISTENCE_RADIUS = 7 | PERSISTENCE_RADIUS = 7 | ||||||
|  | DIFF_TIMEOUT = 60*20 | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| 
 | 
 | ||||||
| import pyarrow as pa | import pyarrow as pa | ||||||
| @ -46,6 +47,15 @@ class PersistMethod: | |||||||
|     wikidiff2 = 4 |     wikidiff2 = 4 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | async def diff_async(differ, last_text, text): | ||||||
|  |     async def _diff(): | ||||||
|  |         return differ.inline_json_diff(last_text, text) | ||||||
|  |     try: | ||||||
|  |         result = await asyncio.wait_for(_diff(), DIFF_TIMEOUT) | ||||||
|  |     except asyncio.TimeoutError: | ||||||
|  |         raise | ||||||
|  |     return result | ||||||
|  | 
 | ||||||
| def calculate_persistence(tokens_added): | def calculate_persistence(tokens_added): | ||||||
|     return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added)) |     return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added)) | ||||||
| 
 | 
 | ||||||
| @ -493,11 +503,21 @@ class WikiqParser: | |||||||
|                     num_context_lines=1000000, |                     num_context_lines=1000000, | ||||||
|                     max_word_level_diff_complexity=-1, |                     max_word_level_diff_complexity=-1, | ||||||
|                     moved_paragraph_detection_cutoff=-1, |                     moved_paragraph_detection_cutoff=-1, | ||||||
|  |                     words_cache_capacity=10000, | ||||||
|  |                     diff_cache_capacity=10000, | ||||||
|  |                     stats_cache_capacity=10000, | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |                 fast_differ = pywikidiff2.pywikidiff2( | ||||||
|  |                     num_context_lines=1000000, | ||||||
|  |                     max_word_level_diff_complexity=40000000, | ||||||
|  |                     moved_paragraph_detection_cutoff=100, | ||||||
|                     words_cache_capacity=-1, |                     words_cache_capacity=-1, | ||||||
|                     diff_cache_capacity=-1, |                     diff_cache_capacity=-1, | ||||||
|                     stats_cache_capacity=-1, |                     stats_cache_capacity=-1, | ||||||
|                 ) |                 ) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|             while not on_last_batch: |             while not on_last_batch: | ||||||
|                 # first loop: next_batch <- batch; |                 # first loop: next_batch <- batch; | ||||||
|                 # second loop: next_batch <- batch; evaluate next_batch. |                 # second loop: next_batch <- batch; evaluate next_batch. | ||||||
| @ -657,7 +677,12 @@ class WikiqParser: | |||||||
|                     last_text = last_rev_text |                     last_text = last_rev_text | ||||||
|                     new_diffs = [] |                     new_diffs = [] | ||||||
|                     for text in row_buffer["text"]: |                     for text in row_buffer["text"]: | ||||||
|                         new_diffs.append(differ.inline_json_diff(last_text, text)) |                         try: | ||||||
|  |                             diff = asyncio.run(diff_async(differ, last_text, text)) | ||||||
|  |                         except asyncio.TimeoutError: | ||||||
|  |                             print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.", file=sys.stderr) | ||||||
|  |                             diff = fast_differ.inline_json_diff(last_text, text) | ||||||
|  |                         new_diffs.append(diff) | ||||||
|                         last_text = text |                         last_text = text | ||||||
|                     row_buffer["diff"] = [ |                     row_buffer["diff"] = [ | ||||||
|                         [ |                         [ | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user