Timeout diffs.
This commit is contained in:
parent
730c678f51
commit
a563eaf6fc
@ -23,13 +23,14 @@ import pywikidiff2
|
||||
from deltas.tokenizers import wikitext_split
|
||||
from more_itertools import ichunked
|
||||
from mwxml import Dump
|
||||
|
||||
import asyncio
|
||||
import wikiq.tables as tables
|
||||
from wikiq.tables import RevisionTable
|
||||
from wikiq.wiki_diff_matcher import WikiDiffMatcher
|
||||
|
||||
TO_ENCODE = ("title", "editor")
|
||||
PERSISTENCE_RADIUS = 7
|
||||
DIFF_TIMEOUT = 60*20
|
||||
from pathlib import Path
|
||||
|
||||
import pyarrow as pa
|
||||
@ -46,6 +47,15 @@ class PersistMethod:
|
||||
wikidiff2 = 4
|
||||
|
||||
|
||||
async def diff_async(differ, last_text, text, timeout=None):
    """Compute ``differ.inline_json_diff(last_text, text)`` with a time limit.

    The diff call is synchronous. Awaiting it directly inside a coroutine
    never yields control to the event loop, so ``asyncio.wait_for`` would have
    no opportunity to fire its timeout. The call is therefore handed to a
    worker thread and only the resulting future is awaited.

    Args:
        differ: object exposing ``inline_json_diff(old_text, new_text)``.
        last_text: the earlier text passed as the first diff argument.
        text: the later text passed as the second diff argument.
        timeout: seconds to wait before giving up; ``None`` (the default)
            uses the module-level ``DIFF_TIMEOUT``.

    Returns:
        Whatever ``differ.inline_json_diff`` returns.

    Raises:
        asyncio.TimeoutError: if the diff does not finish within ``timeout``.

    NOTE(review): a running thread cannot be interrupted, so after a timeout
    the abandoned diff keeps running in the background until it finishes.
    This also assumes ``differ`` releases the GIL while it works and tolerates
    being used from a non-main thread -- confirm for the differ in use.
    """
    import concurrent.futures

    if timeout is None:
        timeout = DIFF_TIMEOUT

    loop = asyncio.get_running_loop()
    # A dedicated executor (rather than the loop's default one) keeps
    # asyncio.run() from joining the still-running worker on shutdown, which
    # would make the caller wait for the slow diff anyway.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        pending = loop.run_in_executor(
            executor, differ.inline_json_diff, last_text, text
        )
        return await asyncio.wait_for(pending, timeout)
    finally:
        # Do not block on a worker that may still be busy after a timeout.
        executor.shutdown(wait=False)
|
||||
|
||||
def calculate_persistence(tokens_added):
    """Summarise a collection of added tokens as a ``(total, count)`` pair.

    Args:
        tokens_added: sized iterable of objects that each expose a
            ``revisions`` attribute supporting ``len()``.

    Returns:
        A 2-tuple: the sum over all tokens of ``len(token.revisions) - 1``,
        followed by the number of tokens.
    """
    persisted_total = 0
    for token in tokens_added:
        persisted_total += len(token.revisions) - 1
    token_count = len(tokens_added)
    return persisted_total, token_count
|
||||
|
||||
@ -493,11 +503,21 @@ class WikiqParser:
|
||||
num_context_lines=1000000,
|
||||
max_word_level_diff_complexity=-1,
|
||||
moved_paragraph_detection_cutoff=-1,
|
||||
words_cache_capacity=10000,
|
||||
diff_cache_capacity=10000,
|
||||
stats_cache_capacity=10000,
|
||||
)
|
||||
|
||||
fast_differ = pywikidiff2.pywikidiff2(
|
||||
num_context_lines=1000000,
|
||||
max_word_level_diff_complexity=40000000,
|
||||
moved_paragraph_detection_cutoff=100,
|
||||
words_cache_capacity=-1,
|
||||
diff_cache_capacity=-1,
|
||||
stats_cache_capacity=-1,
|
||||
)
|
||||
|
||||
|
||||
while not on_last_batch:
|
||||
# first loop: next_batch <- batch;
|
||||
# second loop: next_batch <- batch; evaluate next_batch.
|
||||
@ -657,7 +677,12 @@ class WikiqParser:
|
||||
last_text = last_rev_text
|
||||
new_diffs = []
|
||||
for text in row_buffer["text"]:
|
||||
new_diffs.append(differ.inline_json_diff(last_text, text))
|
||||
try:
|
||||
diff = asyncio.run(diff_async(differ, last_text, text))
|
||||
except asyncio.TimeoutError:
|
||||
print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.". file=sys.stderr)
|
||||
diff = fast_differ.inline_json_diff(last_text, text)
|
||||
new_diffs.append(diff)
|
||||
last_text = text
|
||||
row_buffer["diff"] = [
|
||||
[
|
||||
|
Loading…
Reference in New Issue
Block a user