Timeout diffs.
parent 730c678f51
commit a563eaf6fc
@@ -23,13 +23,14 @@ import pywikidiff2
 from deltas.tokenizers import wikitext_split
 from more_itertools import ichunked
 from mwxml import Dump
+import asyncio
 import wikiq.tables as tables
 from wikiq.tables import RevisionTable
 from wikiq.wiki_diff_matcher import WikiDiffMatcher
 
 TO_ENCODE = ("title", "editor")
 PERSISTENCE_RADIUS = 7
+DIFF_TIMEOUT = 60*20
 from pathlib import Path
 
 import pyarrow as pa
@@ -46,6 +47,15 @@ class PersistMethod:
     wikidiff2 = 4
 
 
+async def diff_async(differ, last_text, text):
+    async def _diff():
+        return differ.inline_json_diff(last_text, text)
+    try:
+        result = await asyncio.wait_for(_diff(), DIFF_TIMEOUT)
+    except asyncio.TimeoutError:
+        raise
+    return result
+
 def calculate_persistence(tokens_added):
     return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added))
 
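Note: a minimal sketch of how the new diff_async wrapper is driven from synchronous code. The FakeDiffer class and sample texts below are invented for illustration; only asyncio.run, asyncio.wait_for, and the inline_json_diff call come from the commit.

import asyncio

DIFF_TIMEOUT = 60*20  # same 20-minute budget as the commit

class FakeDiffer:
    # Hypothetical stand-in for the pywikidiff2 differ; returns immediately.
    def inline_json_diff(self, last_text, text):
        return '{"diff": []}'

async def diff_async(differ, last_text, text):
    async def _diff():
        return differ.inline_json_diff(last_text, text)
    # wait_for raises asyncio.TimeoutError if the awaited work is still pending
    # after DIFF_TIMEOUT seconds; the commit lets that exception propagate to the caller.
    return await asyncio.wait_for(_diff(), DIFF_TIMEOUT)

print(asyncio.run(diff_async(FakeDiffer(), "old text", "new text")))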
@@ -493,11 +503,21 @@ class WikiqParser:
             num_context_lines=1000000,
             max_word_level_diff_complexity=-1,
             moved_paragraph_detection_cutoff=-1,
+            words_cache_capacity=10000,
+            diff_cache_capacity=10000,
+            stats_cache_capacity=10000,
+        )
+
+        fast_differ = pywikidiff2.pywikidiff2(
+            num_context_lines=1000000,
+            max_word_level_diff_complexity=40000000,
+            moved_paragraph_detection_cutoff=100,
             words_cache_capacity=-1,
             diff_cache_capacity=-1,
             stats_cache_capacity=-1,
         )
 
+
         while not on_last_batch:
             # first loop: next_batch <- batch;
             # second loop: next_batch <- batch; evaluate next_batch.
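Note: the hunk above leaves the primary differ's max_word_level_diff_complexity at -1 (apparently no limit) while giving it finite caches, and introduces fast_differ with bounded word-level complexity and moved-paragraph detection so a fallback diff can finish quickly. A rough usage sketch, assuming pywikidiff2 is installed; the sample texts are made up and the exact JSON output depends on the library.

import pywikidiff2

# Bounded fallback configuration mirroring the commit's fast_differ.
fast_differ = pywikidiff2.pywikidiff2(
    num_context_lines=1000000,
    max_word_level_diff_complexity=40000000,
    moved_paragraph_detection_cutoff=100,
    words_cache_capacity=-1,
    diff_cache_capacity=-1,
    stats_cache_capacity=-1,
)

old = "The quick brown fox jumps over the lazy dog.\n"
new = "The quick brown fox leaps over the lazy dog.\n"
print(fast_differ.inline_json_diff(old, new))  # JSON describing the inline diff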
@@ -657,7 +677,12 @@ class WikiqParser:
                 last_text = last_rev_text
                 new_diffs = []
                 for text in row_buffer["text"]:
-                    new_diffs.append(differ.inline_json_diff(last_text, text))
+                    try:
+                        diff = asyncio.run(diff_async(differ, last_text, text))
+                    except asyncio.TimeoutError:
+                        print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.", file=sys.stderr)
+                        diff = fast_differ.inline_json_diff(last_text, text)
+                    new_diffs.append(diff)
                 last_text = text
                 row_buffer["diff"] = [
                     [
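Note: a self-contained sketch of the same timeout-and-fallback control flow. Because asyncio.wait_for can only cancel work that actually yields to the event loop, the slow path here is simulated with asyncio.sleep rather than a blocking inline_json_diff call; the differ stand-ins, texts, and the one-second timeout are all invented for illustration.

import asyncio
import sys

DEMO_TIMEOUT = 1  # seconds; stands in for the commit's 20-minute DIFF_TIMEOUT

async def slow_diff(last_text, text):
    # Simulates a diff that exceeds the timeout; yields so wait_for can cancel it.
    await asyncio.sleep(5)
    return {"diff": "thorough"}

def fast_diff(last_text, text):
    # Simulates the bounded fast_differ fallback.
    return {"diff": "bounded"}

async def diff_with_timeout(last_text, text):
    return await asyncio.wait_for(slow_diff(last_text, text), DEMO_TIMEOUT)

try:
    diff = asyncio.run(diff_with_timeout("old", "new"))
except asyncio.TimeoutError:
    print("WARNING! diff timed out, falling back to bounded settings.", file=sys.stderr)
    diff = fast_diff("old", "new")
print(diff)  # -> {'diff': 'bounded'} after roughly one second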