some work on wiki_diff_matcher.py
commit 186cb82fb8
parent bc7f186112
@@ -4,7 +4,8 @@ import sys
 import requests
 from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete

-TOKENIZER = tokenizers.text_split
+TOKENIZER = tokenizers.wikitext_split

+
 def compute_diffs(url: str, texts: list[str]) -> list:
     response = None
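A quick illustration of the tokenizer switch in the hunk above. Both tokenizers come from the deltas package already imported here; the snippet below is only a sketch of how they are invoked, and the sample text and token counts are illustrative, not taken from this project:

from deltas import tokenizers

sample = "Some [[wikitext]] with a {{template}}."

# Both tokenizers expose .tokenize(str) and return a list of tokens;
# wikitext_split understands wiki markup, text_split is a plain text splitter.
print(len(tokenizers.text_split.tokenize(sample)))
print(len(tokenizers.wikitext_split.tokenize(sample)))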
@@ -36,14 +37,22 @@ def compute_diffs(url: str, texts: list[str]) -> list:
     return incremental_diffs


-def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
+def to_operations(from_text: str, to_text: str, diff: str, tokenizer: RegexTokenizer) -> list:
     d = json.loads(diff)

+    # The code below works in bytes because that's how wikidiff2 indexes offsets.
+    from_text = from_text.encode('utf-8')
+    to_text = to_text.encode('utf-8')
+
+    # Convenience helper for tokenizing bytes.
+    def tokenize(bytes):
+        return tokenizer.tokenize(bytes.decode('utf-8'))
+
     # Keep track of the last difference we saw in order to notice unaccounted-for
     # tokens. Each token at the end of "to" which is skipped for the next diff
     # must be represented as an "Equal()" segment.
-    from_last_end = 0
-    to_last_end = 0
+    from_last_end_bytes = 0
+    to_last_end_bytes = 0

     result = []
     # DiffState expects differences to be represented in order from the
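Why the new to_operations immediately encodes to UTF-8: wikidiff2 reports byte offsets, so slicing the Python str with those offsets would drift as soon as the text contains a multi-byte character. A minimal sketch, with an invented offset value for illustration:

text = "café diff"                          # 'é' is two bytes in UTF-8
data = text.encode('utf-8')

byte_offset = 6                             # hypothetical wikidiff2 offset pointing at the start of "diff"
print(data[:byte_offset].decode('utf-8'))   # 'café '  -- the intended prefix
print(text[:byte_offset])                   # 'café d' -- off by one, because 'é' is one char but two bytes

This is also why the tokenize() helper decodes its byte slice back to str before handing it to the deltas tokenizer.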
@@ -52,56 +61,61 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
     # As a rule of thumb, the "to" segments should be non-overlapping and
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.
-    #
     # wikidiff2 appears to follow this same convention, but this behavior
     # is not documented.

-    for entry in d['diff']:
-        from_start_line = entry['offset']['from']
-        to_start_line = entry['offset']['to']
-        # Per above, to_start_line appears to be nondecreasing, but
-        # from_start_line may sometimes decrease for detected paragraph moves.
+    # Note that, confusingly, for Insert operations only the "to" indexes matter
+    # and for the Delete and Equal operations only the "from" indexes matter.
+    # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`.

-        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
-        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
+    parmove_from_dict = {}  # lookup move diffs based on moveInfo id.
+    parmove_to_dict = {}

+    for entry in d['diff']:
+        linebytes = entry['text'].encode('utf-8')
+        from_start_line = entry['offset']['from']  # the first byte of the line in the 'from' revision.
+        to_start_line = entry['offset']['to']  # the first byte of the line in the 'to' revision.

+        from_start_tokens = len(tokenize(from_text[:from_start_line]))
+        to_start_tokens = len(tokenize(to_text[:to_start_line]))
         # These constant calls to tokenizer.tokenize can definitely be optimized
         # as tokenization is currently a bottleneck. Ideally tokenization would
         # happen incrementally where possible, or somehow be cached, but this
-        # would be more complex.
+        # would be more complex. N: I think it's okay. CPU is cheap.

-        if entry['type'] == 0:
-            # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 0:  # wikidiff2 can emit this if it is called with numContextLines != 0.
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens
             to_end_tokens = to_start_tokens + line_tokens

             result.append(Equal(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_end_tokens))

-            from_last_end = from_end_tokens
-            to_last_end = to_end_tokens
+            # We need to keep track of the 'to' and 'from' last end bytes.
+            from_last_end_bytes += len(linebytes)
+            to_last_end_bytes += len(linebytes)

             continue
         else:
             # These do not appear to be generated by wikidiff2, and so must be
             # inferred.
-            equal_tokens = to_start_tokens - to_last_end
+            equal_tokens = to_start_tokens - to_last_end_bytes
             # If we notice that the next non-zero segment (which must be a
             # change, given that its type is non-zero), begins after the end
             # of the previous segment, we must add an Equal segment.
             # TODO: While the "to" token ranges are correct, the "from"
             # ranges are likely not, particularly in histories with paragraph
-            # moves.
+            # moves. They can be corrected.
             if equal_tokens > 0:
-                result.append(Equal(from_last_end, from_start_line,
-                                    to_last_end, to_start_line))
+                # Only the 'from' indexes matter.
+                result.append(Equal(from_last_end_bytes, from_start_line,
+                                    to_last_end_bytes, to_start_line))

-        if entry['type'] == 1 or entry['type'] == 4:
-            # TODO: Separate out type 4 to recognize this is the insertion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 1:  # a line included in the 'to' revision, but not in the 'from' revision
+            line_tokens = len(tokenize(linebytes))
             to_end_tokens = to_start_tokens + line_tokens

             result.append(Insert(from_start_tokens, from_start_tokens,
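For orientation, this loop consumes wikidiff2's JSON diff structure. The sketch below shows only the fields the code actually reads (the 'diff' list, 'type', 'text', 'offset', 'highlightRanges'); the values are invented for illustration and are not real wikidiff2 output:

example = {
    "diff": [
        {
            # type 3: a changed line; highlightRanges carry byte-indexed edits within it
            "type": 3,
            "text": "The quick brown fox",
            "offset": {"from": 120, "to": 118},
            "highlightRanges": [
                {"start": 4, "length": 6, "type": 0},   # type 0 = insertion, 1 = deletion
            ],
        },
        {
            # type 2: a line present in the 'from' revision but not in the 'to' revision
            "type": 2,
            "text": "A removed line",
            "offset": {"from": 160, "to": 118},
        },
    ]
}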
@@ -109,64 +123,83 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
                                 ))

             # We have now used more of the "to" tokens.
-            to_last_end = to_end_tokens
-        elif entry['type'] == 2 or entry['type'] == 5:
-            # TODO: Separate out type 5 to recognize this is the deletion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+            to_start_end = to_end_tokens
+        elif entry['type'] == 2:  # a line included in the 'from' revision, but not in the 'to' revision
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens

             result.append(Delete(from_start_tokens, from_end_tokens,
                                  to_start_tokens, to_start_tokens,
                                  ))

-            # We have not used more of the "from" tokens.
-            from_last_end = from_end_tokens
+            # We have now used more of the "from" tokens.
+            from_last_end_bytes = from_end_tokens

         elif entry['type'] == 3:
-            # The text field is an overlapping mix of both the previous and next
-            # lines, and so we can't directly tokenize it.
-            text = entry['text']
-
-            last_end = 0
-            previous_line = ""
-            next_line = ""
-
-            # A line will have one or more highlightRanges.
-            # It is not guaranteed that insertions/deletions are matched,
+            # The text field is an overlapping mix of both the from and to lines,
+            # so we need to handle it highlight-by-highlight.
+            # There can be gaps between highlight segments,
             # for instance, if a word is deleted from the middle of a line.
+            # We need to track that.
+            highlight_last_end = 0

+            # Note that diffs are token-level, but the indexes are byte-level.
             for highlightRange in entry['highlightRanges']:
-                if highlightRange['start'] > last_end:
-                    previous_line += text[last_end:highlightRange['start']]
-                    next_line += text[last_end:highlightRange['start']]
-                    # Add an Equal segment.
+                if highlightRange['start'] > highlight_last_end:
+                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
+                    equal_tokens = len(tokenize(equal_bytes))
+                    from_end_tokens = from_start_tokens + equal_tokens
+                    to_end_tokens = to_end_tokens + equal_tokens
+                    result.append(Equal(from_start_tokens, from_end_tokens,
+                                        to_start_tokens, to_end_tokens
+                                        ))
+
+                    from_start_tokens = from_end_tokens
+                    to_start_tokens = to_end_tokens

                 rangeStart = highlightRange['start']
                 rangeEnd = rangeStart + highlightRange['length']
+                range_bytes = linebytes[rangeStart:rangeEnd]
+                range_tokens = len(tokenize(range_bytes))
                 if highlightRange['type'] == 0:
                     # Insertion
-                    next_line += text[rangeStart:rangeEnd]
-                    # Add an Insert segment.
+                    to_end_tokens = to_start_tokens + range_tokens
+                    result.append(Insert(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
+
+                    to_start_tokens = to_end_tokens
                 elif highlightRange['type'] == 1:
                     # Deletion
-                    previous_line += text[rangeStart:rangeEnd]
-                    # Add a Delete segment.
+                    from_end_tokens = from_start_tokens + range_tokens
+                    result.append(Delete(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
+                    from_start_tokens = from_end_tokens
                 else:
                     raise Exception(entry)

-                from_tokens = len(tokenizer.tokenize(previous_line))
-                to_tokens = len(tokenizer.tokenize(next_line))
-
-                from_start_tokens += from_tokens
-                to_start_tokens += to_tokens
+                highlight_last_end = highlightRange['start'] + highlightRange['length']

+        elif entry['type'] == 4:
+            parmove_from_dict['moveInfo']['id'] = diff
+
+        elif entry['type'] == 5:
+            parmove_to_dict['moveInfo']['id'] = diff
+            # For type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
+            # For deletions and equality, report the token indexes from the 'from' revision.
         else:
             # The 'type' isn't one of the known types.
             raise ValueError(d)

+    # Now we go through the parmoves.
+    for id, from_diff in parmove_from_dict.items():
+        to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
+        ### TODO: calculate the correct token indexes.
+
     # TODO: Handle trailing tokens

     # raise Exception(result)
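Finally, a hedged usage sketch of the new signature, assuming the function ends by returning result (the return statement sits outside this hunk) and that wikidiff2_json stands in for one of the strings produced by compute_diffs() for a revision pair; the texts and placeholder diff below are invented:

import json
from deltas import tokenizers

from_text = "old revision text"
to_text = "new revision text"
wikidiff2_json = json.dumps({"diff": []})   # placeholder; a real diff has entries like the example above

ops = to_operations(from_text, to_text, wikidiff2_json, tokenizers.wikitext_split)
for op in ops:
    # Each op is a deltas Equal/Insert/Delete holding token index ranges into the
    # 'from' and 'to' token streams, which is what mwpersistence's DiffState expects.
    print(op)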