diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py
index 03e57e2..93cfe49 100755
--- a/src/wikiq/__init__.py
+++ b/src/wikiq/__init__.py
@@ -23,13 +23,14 @@ import pywikidiff2
 from deltas.tokenizers import wikitext_split
 from more_itertools import ichunked
 from mwxml import Dump
-
+import asyncio
 import wikiq.tables as tables
 from wikiq.tables import RevisionTable
 from wikiq.wiki_diff_matcher import WikiDiffMatcher
 
 TO_ENCODE = ("title", "editor")
 PERSISTENCE_RADIUS = 7
+DIFF_TIMEOUT = 60*20
 from pathlib import Path
 
 import pyarrow as pa
@@ -46,6 +47,33 @@ class PersistMethod:
     wikidiff2 = 4
 
 
+async def diff_async(differ, last_text, text):
+    """Diff ``last_text`` against ``text``, giving up after DIFF_TIMEOUT seconds.
+
+    ``differ.inline_json_diff`` is a blocking call, so it runs on a worker
+    thread: called directly inside a coroutine it never yields to the event
+    loop, and ``asyncio.wait_for`` cannot enforce the timeout.
+
+    Raises asyncio.TimeoutError when the diff exceeds DIFF_TIMEOUT.
+    """
+    # Local import so this function does not rely on the module's import block.
+    from concurrent.futures import ThreadPoolExecutor
+
+    # NOTE(review): the timeout can only fire if inline_json_diff releases the
+    # GIL, and an abandoned diff keeps using ``differ`` -- confirm both.
+    # A private executor is used because asyncio.run() joins the loop's default
+    # executor on shutdown, which would block until the abandoned diff finished.
+    executor = ThreadPoolExecutor(max_workers=1)
+    try:
+        loop = asyncio.get_running_loop()
+        pending = loop.run_in_executor(
+            executor, differ.inline_json_diff, last_text, text
+        )
+        return await asyncio.wait_for(pending, DIFF_TIMEOUT)
+    finally:
+        # A running diff cannot be interrupted, so never wait for it here.
+        executor.shutdown(wait=False)
+
 def calculate_persistence(tokens_added):
     return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added))
 
@@ -493,11 +521,21 @@ class WikiqParser:
             num_context_lines=1000000,
             max_word_level_diff_complexity=-1,
             moved_paragraph_detection_cutoff=-1,
+            words_cache_capacity=10000,
+            diff_cache_capacity=10000,
+            stats_cache_capacity=10000,
+        )
+
+        fast_differ = pywikidiff2.pywikidiff2(
+            num_context_lines=1000000,
+            max_word_level_diff_complexity=40000000,
+            moved_paragraph_detection_cutoff=100,
             words_cache_capacity=-1,
             diff_cache_capacity=-1,
             stats_cache_capacity=-1,
         )
 
+
         while not on_last_batch:
             # first loop: next_batch <- batch;
             # second loop: next_batch <- batch; evaluate next_batch.
@@ -657,7 +695,13 @@ class WikiqParser:
                     last_text = last_rev_text
                 new_diffs = []
                 for text in row_buffer["text"]:
-                    new_diffs.append(differ.inline_json_diff(last_text, text))
+                    try:
+                        diff = asyncio.run(diff_async(differ, last_text, text))
+                    except asyncio.TimeoutError:
+                        print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.",
+                              file=sys.stderr)
+                        diff = fast_differ.inline_json_diff(last_text, text)
+                    new_diffs.append(diff)
                     last_text = text
                 row_buffer["diff"] = [
                     [