1
0
pywikidiff2/test/benchmark_wd2.py
Nathan TeBlunthuis e950754c4a bugfix.
2025-08-07 17:13:14 -07:00

43 lines
1.4 KiB
Python

import pyarrow.parquet as pq
from itertools import groupby
from operator import itemgetter
import sys
from typing import Generator, Tuple, List, Any, Dict
import pywikidiff2
# Module-level differ instance used for the diff calls in this benchmark.
# num_context_lines=10 is passed straight to pywikidiff2; presumably it is the
# number of unchanged lines reported around each change -- TODO confirm
# against the pywikidiff2 documentation.
differ = pywikidiff2.pywikidiff2(num_context_lines=10)
def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
    """Yield the rows of *parquet_file* one at a time as plain dicts.

    Only the ``articleid`` and ``text`` columns are requested. The file is
    consumed batch by batch via ``iter_batches`` and each batch is converted
    with ``to_pylist`` before its rows are handed out in order.

    Args:
        parquet_file: An opened Parquet file to read from.

    Yields:
        One dict per row, as produced by ``to_pylist`` on each batch.
    """
    wanted_columns = ['articleid', 'text']
    for record_batch in parquet_file.iter_batches(columns=wanted_columns):
        for row in record_batch.to_pylist():
            yield row
def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, List[str]], None, None]:
    """Stream ``(article_id, texts)`` pairs from a Parquet file.

    Rows are read lazily and grouped with ``itertools.groupby`` on the
    ``articleid`` field, so only *consecutive* rows sharing an id end up in
    the same group; the ``text`` values of each group are collected into a
    list in row order.

    Any error, whether from opening the file or raised while streaming, is
    reported with ``print`` and then swallowed, which simply ends the
    generator early.

    Args:
        file_path: Path of the Parquet file to read.

    Yields:
        Tuples of the group's ``articleid`` value and the list of its texts.
    """
    article_key = itemgetter('articleid')
    text_of = itemgetter('text')
    try:
        rows = _iter_row_dicts(pq.ParquetFile(file_path))
        for article_id, article_rows in groupby(rows, key=article_key):
            yield article_id, list(map(text_of, article_rows))
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
def run_benchmark(pages, diff, out) -> Tuple[int, int]:
    """Diff every revision against its predecessor and count the work done.

    Args:
        pages: Iterable of ``(article_id, texts)`` pairs, where ``texts`` is
            the ordered sequence of revision texts for one article.
        diff: Callable invoked as ``diff(previous_text, text)``; its return
            value is printed to *out*.
        out: Writable text stream that receives one printed diff per revision.

    Returns:
        ``(page_count, revision_count)``: the number of articles seen and the
        total number of revision texts diffed across all of them.
    """
    page_count = 0
    revision_count = 0
    for _, texts in pages:
        page_count += 1
        # Every article starts from an empty document, so its first revision
        # is diffed against "".
        last_text = ""
        for text in texts:
            # Count per revision here; incrementing once per article made the
            # reported revision total always equal the page total.
            revision_count += 1
            print(diff(last_text, text), file=out)
            last_text = text
    return page_count, revision_count


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} PARQUET_FILE")
    # The diffs themselves are discarded; only the counts are reported.
    with open('/dev/null', 'w') as of:
        p, i = run_benchmark(
            stream_text_by_article_streaming(sys.argv[1]),
            differ.inline_json_diff,
            of,
        )
    print(f"{p} pages")
    print(f"{i} revisions")