Got wikidiff2 persistence working, except for paragraph moves.

This commit is contained in:
Nathan TeBlunthuis
2025-06-30 15:37:54 -07:00
parent 186cb82fb8
commit 5a3e4102b5
6 changed files with 207 additions and 177 deletions

9
wikiq
View File

@@ -140,7 +140,6 @@ The pattern can include capture groups. If it does then each capture group will
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
def __init__(self, pattern, label):
self.pattern = re.compile(pattern)
@@ -219,7 +218,7 @@ class WikiqParser:
revert_radius: int = 15,
output_parquet: bool = True,
parquet_buffer_size: int = 2000,
wikidiff_url: str = "",
wikidiff_url: str = "http://127.0.0.1:8000",
):
"""
@@ -450,9 +449,9 @@ class WikiqParser:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.wikidiff:
state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
revision_texts,
tokenizer=wikitext_split),
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
tokenizer=wikitext_split,
self.wikidiff_url),
revert_radius=PERSISTENCE_RADIUS)
else:
from mw.lib import persistence