got wikidiff2 persistence working except for paragraph moves.
parent 186cb82fb8
commit 5a3e4102b5

.gitmodules (vendored, 3 added lines)
@@ -0,0 +1,3 @@
+[submodule "mediawiki-php-wikidiff2"]
+	path = mediawiki-php-wikidiff2
+	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/
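
After cloning, the new submodule can be fetched with the usual ``git submodule update --init mediawiki-php-wikidiff2`` (standard git; the path comes from the entry above).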
@@ -20,6 +20,13 @@ associated tests to work.
 - 7zip
 - ffmpeg
 
+A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between successive page revisions. This requires installing wikidiff2 on your system; on Debian or Ubuntu Linux this can be done with:
+
+``apt-get install php-wikidiff2``
+
+You may also have to run
+``sudo phpenmod wikidiff2``
+
 Tests
 ----
 To run tests::
@@ -30,3 +37,5 @@ TODO:
 _______________
 1. [] Output metadata about the run. What parameters were used? What versions of deltas?
 2. [] Url encoding by default
+
+.. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2
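
For orientation, a minimal sketch (illustrative, not part of the commit) of how the local wikidiff2 service used below might be queried directly. It assumes the server listens on the default ``http://127.0.0.1:8000`` from this commit, accepts a JSON array of revision texts, and returns one wikidiff2 JSON diff per consecutive pair, which is what ``compute_diffs`` expects::

    import json
    import requests

    revisions = ["first revision text", "first revision text, lightly edited"]

    # Assumption: the texts are posted as the JSON body; the response is a list
    # with one wikidiff2 diff (itself a JSON string) per consecutive pair.
    response = requests.post("http://127.0.0.1:8000", json=revisions)
    response.raise_for_status()

    for raw in response.json():
        diff = json.loads(raw)
        for entry in diff["diff"]:
            # type 0 = unchanged line, 1 = inserted line, 2 = deleted line,
            # 3 = word-level changes, 4/5 = the two ends of a paragraph move
            print(entry["type"], entry["offset"], entry["text"][:40])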
@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = "~=3.9"
+requires-python = ">=3.9"
 dependencies = [
     "deltas>=0.7.0",
     "mediawiki-utilities>=0.4.18",
@@ -18,8 +18,11 @@ dependencies = [
 [tool.uv.sources]
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
+deltas = { git = "https://github.com/groceryheist/deltas" }
 
 [dependency-groups]
 dev = [
-    "pandas>=2.1.0"
+    "pandas>=2.1.0",
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
 ]
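
Since pytest and pytest-asyncio are added to the ``dev`` group, the test suite can presumably be run with ``uv sync`` followed by ``uv run pytest`` (assuming uv, which the ``[tool.uv.sources]`` table implies).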
@@ -1,8 +1,11 @@
 import json
 import sys
+from itertools import chain
+from typing import Generator, List, Optional, Tuple
 
 import requests
-from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
+                    RegexTokenizer, Token, tokenizers)
 
 TOKENIZER = tokenizers.wikitext_split
 
@@ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list:
         incremental_diffs = response.json()
     except requests.exceptions.ConnectionError as e:
         print(
-            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.")
+            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
+        )
         print(e)
         raise e
     except requests.exceptions.HTTPError as e:
@@ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list:
         print(f"An unexpected error occurred: {e}")
         raise e
 
+    # for diff in incremental_diffs:
+    #     for wikidiffop in json.loads(diff)["diff"][0:5]:
+    #         print(wikidiffop)
 
     return incremental_diffs
 
 
-def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list:
+class DiffToOperationMap:
-    d = json.loads(diff)
 
-    # the code below is designed to work in bytes because that's how wikidiff2 indexes
+    def __init__(self, from_text, to_text, diff, tokenizer):
-    from_text = from_text.encode('utf-8')
-    to_text = to_text.encode('utf-8')
 
-    # convinient function for tokenizing bytes
+        self.diff = diff
-    def tokenize(bytes):
+        self.tokenizer = tokenizer
-        return tokenizer.tokenize(bytes.decode('utf-8'))
+        self.diff = json.loads(diff)
 
-    # Keep track of the last difference we saw in order to notice unaccounted-for
+        # the code below is designed to work in bytes because that's how wikidiff2 indexes
-    # tokens. Each token at the end of "to" which is skipped for the next diff
+        self.from_bytes = from_text.encode("utf-8")
-    # must be represented as an "Equal()" segment.
+        self.to_bytes = to_text.encode("utf-8")
-    from_last_end_bytes = 0
-    to_last_end_bytes = 0
 
-    result = []
-    # DiffState expects differences to be represented in order from the
-    # result's perspective ("to"), not the previous text. Thus, if a line
-    # is moved earlier then its insertion should appear before its deletion.
-    # As a rule of thumb, the "to" segments should be non-overlapping and
-    # strictly increasing, while the "from" segments should merely be
-    # non-overlapping.
 
-    # wikidiff2 appears to follow this same convention, but this behavior
-    # is not documented.
 
-    # Note that, confusingly for Insert operations only the "to" indexes matter
+        self.from_last_end_bytes = 0
-    # and for the Delete and Equal operations only the "from" indexes matter.
+        self.from_last_to_bytes = 0
-    # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`
+        self.n_from_start_tokens = 0
+        self.n_to_start_tokens = 0
+        self.last_to_start_line = 0
+        self.last_from_start_line = 0
+        self.from_last_end_bytes = 0
+        self.to_last_end_bytes = 0
 
+    def tokenize(self, bytes):
+        return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
-    parmove_from_dict = {} # lookup move diffs based on moveinfo id.
+    def to_operations(self):
-    parmove_to_dict = {}
+        parmove_from_dict = {} # lookup move diffs based on moveinfo id.
+        parmove_to_dict = {}
-    for entry in d['diff']:
+        for entry in self.diff["diff"]:
-        linebytes = entry['text'].encode('utf-8')
+            offset = entry['offset']
-        from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision.
+            linebytes = entry["text"].encode("utf-8")
-        to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision.
 
-        from_start_tokens = len(tokenize(from_text[:from_start_line]))
+            # ignore empty diffs. They don't have any tokens
-        to_start_tokens = len(tokenize(to_text[:to_start_line]))
+            if len(linebytes) == 0:
-        # These constant calls to tokenizer.tokenize can definitely be optimized
+                continue
-        # as tokenization is currently a bottleneck. Ideally tokenization would
+            # this is the first byte of the line in the 'from' revision.
-        # happen incrementally where possible, or somehow be cached, but this
+            from_start_line = entry["offset"]["from"]
-        # would be more complex. N: I think it's okay. CPU is cheap.
+            # this is the first byte of the line in the 'to' revision.
+            to_start_line = entry["offset"]["to"]
 
-        if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0.
+            if entry["type"] == 0:
+                yield from self.doEqual(linebytes, offset)
-            line_tokens = len(tokenize(linebytes))
-            from_end_tokens = from_start_tokens + line_tokens
-            to_end_tokens = to_start_tokens + line_tokens
 
-            result.append(Equal(from_start_tokens, from_end_tokens,
-                                to_start_tokens, to_end_tokens))
 
-            # we need to keep track of the to and from last end bytes
-            from_last_end_bytes += len(linebytes)
-            to_last_end_bytes += len(linebytes)
 
-            continue
-        else:
-            # These do not appear to be generated by wikidiff2, and so must be
-            # inferred.
-            equal_tokens = to_start_tokens - to_last_end_bytes
-            # If we notice that the next non-zero segment (which must be a
-            # change, given that its type is non-zero), begins after the end
-            # of the previous segment, we must add an Equal segment.
-            # TODO: While the "to" token ranges are correct,
-            # the "from"
-            # ranges are likely not, particularly in histories with paragraph
-            # moves. they can be corrected.
-            if equal_tokens > 0:
-                # only the 'from' indexes matter
-                result.append(Equal(from_last_end_bytes, from_start_line,
-                                    to_last_end_bytes, to_start_line))
 
-        if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision
-            line_tokens = len(tokenize(linebytes))
-            to_end_tokens = to_start_tokens + line_tokens
 
-            result.append(Insert(from_start_tokens, from_start_tokens,
-                                 to_start_tokens, to_end_tokens,
-                                 ))
 
-            # We have now used more of the "to" tokens.
-            to_start_end = to_end_tokens
 
-        elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision
+            # a line included in the 'to' revision, but not in the 'from' revision
-            line_tokens = len(tokenize(linebytes))
+            elif entry["type"] == 1:
-            from_end_tokens = from_start_tokens + line_tokens
+                yield from self.doInsert(linebytes, offset)
 
-            result.append(Delete(from_start_tokens, from_end_tokens,
+            # a line included in the 'from' revision, but not in the 'to' revision
-                                 to_start_tokens, to_start_tokens,
+            elif entry["type"] == 2:
-                                 ))
+                yield from self.doDelete(linebytes, offset)
 
-            # We have now used more of the "from" tokens.
+            elif entry["type"] == 3:
-            from_last_end_bytes = from_end_tokens
+                yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset)
 
-        elif entry['type'] == 3:
+            elif entry["type"] == 4:
-            # The text field is an overlapping mix of both the from and to,
+                parmove_from_dict["moveInfo"]["id"] = diff
-            # so we need to handle it highlight-by-highlight.
-            # there can be gaps between highlight segments.
-            # for instance, if a word is deleted from the middle of a line.
-            # we need to track that.
-            highlight_last_end = 0
 
-            # note that diffs are token-level, but the indexes are byte-level
+            elif entry["type"] == 5:
-            for highlightRange in entry['highlightRanges']:
+                # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
-                if highlightRange['start'] > highlight_last_end:
+                parmove_to_dict["moveInfo"]["id"] = diff
+                # for deletions and equality report the token indexes from the 'from' revision.
 
-                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
+            else:
-                    equal_tokens = len(tokenize(equal_bytes))
+                # The 'type' isn't one of the known
-                    from_end_tokens = from_start_tokens + equal_tokens
+                raise ValueError(d)
-                    to_end_tokens = to_end_tokens + equal_tokens
-                    result.append(Equal(from_start_tokens, from_end_tokens,
-                                        to_start_tokens, to_end_tokens
-                                        ))
 
-                    from_start_tokens = from_end_tokens
-                    to_start_tokens = to_end_tokens
+        # mwpersistence expects differences to be represented in order from the
+        # result's perspective ("to"), not the previous text. Thus, if a line
-                rangeStart = highlightRange['start']
+        # is moved earlier then its insertion should appear before its deletion.
-                rangeEnd = rangeStart + highlightRange['length']
+        # As a rule of thumb, the "to" segments should be non-overlapping and
-                range_bytes = linebytes[rangeStart:rangeEnd]
+        # strictly increasing, while the "from" segments should merely be
-                range_tokens = len(tokenize(range_bytes))
+        # non-overlapping.
-                if highlightRange['type'] == 0:
-                    # Insertion
-                    to_end_tokens = to_start_tokens + range_tokens
-                    result.append(Insert(from_start_tokens, from_end_tokens,
-                                         to_start_tokens, to_end_tokens))
 
-                    to_start_tokens = to_end_tokens
+        # now we go through the parmoves
-                elif highlightRange['type'] == 1:
+        for id, from_diff in parmove_from_dict.items():
-                    # Deletion
+            to_diff = parmove_from_dict[from_diff["moveInfo"]["linkId"]]
-                    from_end_tokens = from_start_tokens + range_tokens
-                    result.append(Delete(from_start_tokens, from_end_tokens,
-                                         to_start_tokens, to_end_tokens))
-                    from_start_tokens = from_end_tokens
 
-                else:
-                    raise Exception(entry)
 
-                highlight_last_end = highlightRange['start'] + highlightRange['length']
 
-        elif entry['type'] == 4:
+    def doEqual(self, equal_bytes, offset):
+        tokens = self.tokenize(equal_bytes)
+        n_tokens = len(tokens)
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Equal(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            tokens,
+            tokens,
+        )
+        # we need to keep track of the to and from last end bytes
+        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
+        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
+        self.n_from_start_tokens += n_tokens
+        self.n_to_start_tokens += n_tokens
 
-            parmove_from_dict['moveInfo']['id'] = diff
 
-        elif entry['type'] == 5:
+    def doInsert(self, insert_bytes, offset):
+        tokens = self.tokenize(insert_bytes)
+        n_tokens = len(tokens)
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Insert(
+                self.n_from_start_tokens,
+                self.n_from_start_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            [],
+            tokens,
+        )
+        # We have now used more of the "to" tokens.
+        self.n_to_start_tokens += n_tokens
+        self.to_last_end_bytes = offset["to"] + len(insert_bytes)
 
-            parmove_to_dict['moveInfo']['id'] = diff
+    def doDelete(self, delete_bytes, offset):
-            # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
+        tokens = self.tokenize(delete_bytes)
-            # for deletions and equality report the token indexes from the 'from' revision.
+        n_tokens = len(tokens)
-        else:
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
-            # The 'type' isn't one of the known
+        yield (
-            raise ValueError(d)
+            Delete(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_start_tokens,
+            ),
+            tokens,
+            [],
+        )
+        # We have now used more of the "from" tokens.
+        self.n_from_start_tokens += n_tokens
+        self.from_last_end_bytes = offset["from"] + len(delete_bytes)
 
-    # now we go through the parmoves
+    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
-    for id, from_diff in parmove_from_dict.items():
+        # The text field is an overlapping mix of both the from and to,
-        to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
+        # so we need to handle it highlight-by-highlight.
-        ### TODO calculate the correct token indexes.
+        # there can be gaps between highlight segments.
+        # for instance, if a word is deleted from the middle of a line.
+        # we need to track that.
+        highlight_end = 0
+        highlight_offset = offset
+        # note that diffs are token-level, but the indexes are byte-level
 
-    # TODO: Handle trailing tokens
+        for highlightRange in highlightRanges:
+            highlight_start = highlightRange["start"]
+            # equal bytes in between highlights
+            if highlight_start > highlight_end:
 
+                equal_bytes = highlight_bytes[
+                    highlight_end : highlight_start
+                ]
+                n_equal_bytes = len(equal_bytes)
+                yield from self.doEqual(equal_bytes, highlight_offset)
+                highlight_offset['from'] += n_equal_bytes
+                highlight_offset['to'] += n_equal_bytes
 
+            # handle highlighted insert / delete
+            highlight_end = highlight_start + highlightRange["length"]
+            range_bytes = highlight_bytes[highlight_start:highlight_end]
+            n_range_bytes = len(range_bytes)
+            if highlightRange["type"] == 0:
+                yield from self.doInsert(range_bytes, highlight_offset)
+                highlight_offset['to'] += n_range_bytes
+            elif highlightRange["type"] == 1:
+                yield from self.doDelete(range_bytes, highlight_offset)
+                highlight_offset['from'] += n_range_bytes
+            else:
+                raise Exception(entry)
 
+        # handle the rest of the line which is equal
+        if highlight_end < len(highlight_bytes):
+            range_bytes = highlight_bytes[highlight_end:]
+            yield from self.doEqual(range_bytes, highlight_offset)
 
-    # raise Exception(result)
-    return result
 
 class WikiDiffMatcher:
-    def __init__(self,
+    def __init__(
-                 url: str,
+        self,
-                 texts: list[str],
+        texts: list[str] = None,
-                 tokenizer: RegexTokenizer = None,
+        tokenizer: Optional[RegexTokenizer] = None,
-                 ):
+        url: Optional[str] = "http://127.0.0.1:8000",
+    ):
         # Pre-compute diffs to reduce traffic overhead.
         self.diffs = compute_diffs(url, texts)
         self.tokenizer = tokenizer or TOKENIZER
 
     class Processor(DiffEngine.Processor):
-        def __init__(self,
+        def __init__(self, texts, tokenizer=None):
-                     diffs,
+            self.diffs = iter(texts)
-                     tokenizer=None
-                     ):
-            self.diffs = iter(diffs)
             self.tokenizer = tokenizer or TOKENIZER
             self.last_tokens = []
             self.previous_text = ""
@@ -229,28 +246,27 @@ class WikiDiffMatcher:
             self.last_tokens = last_tokens
 
         def process(self, text, token_class=None):
-            # IDEs will report the method signature as incorrect, but this is
-            # expected. The DiffEngine.Processor class must be inherited from,
-            # and its process definition incorrectly excludes a "self" argument.
 
             # The diff has already been computed, but we need to incrementally
             # retrieve it to recreate the behavior DiffState expects.
             diff = next(self.diffs)
+            diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer)
+            (
+                operations,
+                aseq,
+                bseq,
+            ) = list(
+                zip(*diffToOperationsMapper.to_operations())
+            )
 
-            tokens = self.tokenizer.tokenize(text, token_class=token_class)
+            self.last_tokens = list(chain.from_iterable(aseq))
-            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
+            tokens = list(chain.from_iterable(bseq))
 
-            a = self.last_tokens
-            b = tokens
-            self.last_tokens = tokens
             self.previous_text = text
 
-            return operations, a, b
+            return operations, self.last_tokens, tokens
 
         def processor(self, *args, **kwargs):
             return self.Processor(self.diffs, self.tokenizer)
 
 
         def process(self):
             # DiffState checks for this method even though it is not called.
             raise Exception("Unnecessary implementation")
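
To make the new data flow concrete, a short sketch (illustrative; ``previous_text``, ``current_text``, and ``diff_json`` are placeholder names) of what ``DiffToOperationMap.to_operations()`` yields and how ``Processor.process`` consumes it::

    # diff_json is one wikidiff2 JSON diff as returned by the local server.
    mapper = DiffToOperationMap(previous_text, current_text, diff_json, TOKENIZER)

    # Each yielded item is a triple: (operation, tokens_from_old_text, tokens_from_new_text),
    # where the operation is a deltas Equal/Insert/Delete over *token* index ranges;
    # the byte offsets reported by wikidiff2 never leave the mapper.
    operations, aseq, bseq = zip(*mapper.to_operations())

    # mwpersistence's DiffState wants (operations, old_tokens, new_tokens), so the
    # per-segment token runs are flattened back into full token sequences.
    old_tokens = list(chain.from_iterable(aseq))
    new_tokens = list(chain.from_iterable(bseq))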
@@ -17,7 +17,7 @@ $data = json_decode($rawData, true);
 $previous = '';
 $result = [];
 foreach ($data as $i => $value) {
-    $result[] = wikidiff2_inline_json_diff($previous, $value, 0);
+    $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000);
     $previous = $value;
 }
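
The third argument to ``wikidiff2_inline_json_diff`` is the ``numContextLines`` parameter referenced in the comments above; raising it from 0 to 5000000 makes wikidiff2 emit unchanged (type 0) lines as well, which the new ``doEqual`` path relies on to keep its token counters aligned between revisions.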
wikiq (9 changed lines)
@@ -140,7 +140,6 @@ The pattern can include capture groups. If it does then each capture group will
 If the pattern does not include a capture group, then only one output column will result.
 """
 
-
 class RegexPair(object):
     def __init__(self, pattern, label):
         self.pattern = re.compile(pattern)
@@ -219,7 +218,7 @@ class WikiqParser:
                  revert_radius: int = 15,
                  output_parquet: bool = True,
                  parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "",
+                 wikidiff_url: str = "http://127.0.0.1:8000",
                  ):
 
         """
@@ -450,9 +449,9 @@ class WikiqParser:
             state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                             revert_radius=PERSISTENCE_RADIUS)
         elif self.persist == PersistMethod.wikidiff:
-            state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
-                                                            revision_texts,
-                                                            tokenizer=wikitext_split),
+            state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
+                                                            tokenizer=wikitext_split,
+                                                            self.wikidiff_url),
                                             revert_radius=PERSISTENCE_RADIUS)
         else:
             from mw.lib import persistence
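
Putting the pieces together, a minimal sketch of the persistence flow wikiq now wires up (assumptions: the local diff service is running, ``WikiDiffMatcher`` is imported from the new matcher module, and ``DiffState.update`` returns ``(operations, tokens_added, tokens_removed)`` as in typical mwpersistence usage)::

    import mwpersistence
    from deltas.tokenizers import wikitext_split

    # revision_texts: one page's revision texts in chronological order.
    matcher = WikiDiffMatcher(revision_texts,
                              tokenizer=wikitext_split,
                              url="http://127.0.0.1:8000")
    state = mwpersistence.DiffState(matcher, revert_radius=15)

    for text in revision_texts:
        # Signature assumed from typical mwpersistence usage.
        operations, tokens_added, tokens_removed = state.update(text)
        # tokens_added / tokens_removed are what wikiq aggregates into
        # word-persistence statistics for each revision.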