add script for benchmarking wikidiff2.
This commit is contained in:
parent
480a866d2c
commit
a1efecd0e5
@ -10,3 +10,8 @@ readme = "README.md"
|
|||||||
requires-python = ">=3.9"
|
requires-python = ">=3.9"
|
||||||
dependencies = []
|
dependencies = []
|
||||||
authors = [{ name="Nathan TeBlunthuis", email="nathanteblunthuis@gmail.com"}]
|
authors = [{ name="Nathan TeBlunthuis", email="nathanteblunthuis@gmail.com"}]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"pyarrow>=21.0.0",
|
||||||
|
]
|
||||||
|
40
test/benchmark_wd2.py
Normal file
40
test/benchmark_wd2.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import pyarrow.parquet as pq
|
||||||
|
from itertools import groupby
|
||||||
|
from operator import itemgetter
|
||||||
|
import sys
|
||||||
|
from typing import Generator, Tuple, List, Any, Dict
|
||||||
|
import pywikidiff2
|
||||||
|
|
||||||
|
# Module-level differ instance; the benchmark loop at the bottom of this
# script calls its inline_json_diff method for every diff it computes.
differ = pywikidiff2.pywikidiff2()
|
||||||
|
|
||||||
|
def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
    """Lazily yield one dict per row of *parquet_file*.

    Only the ``articleid`` and ``text`` columns are requested. Rows are
    produced batch by batch, so the whole file is never materialized at once.
    """
    wanted_columns = ['articleid', 'text']
    for record_batch in parquet_file.iter_batches(columns=wanted_columns):
        for row in record_batch.to_pylist():
            yield row
|
||||||
|
|
||||||
|
|
||||||
|
def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, List[str]], None, None]:
    """Yield ``(article_id, texts)`` for each run of rows sharing an ``articleid``.

    Rows are read lazily from the Parquet file at *file_path* and grouped with
    ``itertools.groupby``, so only *adjacent* rows with the same ``articleid``
    end up in the same group; ``texts`` holds the ``text`` values of that run
    in file order.

    Errors are handled on a best-effort basis: a missing file or any other
    exception is reported on stderr and ends the stream early instead of
    propagating to the caller.

    Args:
        file_path: Path of the Parquet file to read.

    Yields:
        Tuples of the group's ``articleid`` and the list of its ``text`` values.
    """
    try:
        # The context manager closes the file handle once the stream is
        # exhausted, fails, or is abandoned by the consumer.
        with pq.ParquetFile(file_path) as pq_file:
            row_stream = _iter_row_dicts(pq_file)
            for article_id, group_iterator in groupby(row_stream,
                                                      key=itemgetter('articleid')):
                texts = [row['text'] for row in group_iterator]
                yield article_id, texts

    except FileNotFoundError:
        # Diagnostics go to stderr so they never mix with regular output.
        print(f"Error: The file at '{file_path}' was not found.", file=sys.stderr)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# Benchmark driver: for every article, diff each revision against the one
# before it. Results are written to /dev/null so that only the work of
# producing the diffs is exercised, not the cost of storing them.
with open('/dev/null', 'w') as of:
    for article_id, texts in stream_text_by_article_streaming(sys.argv[1]):
        # The generator yields all texts of one article at a time; each
        # article starts over from an empty document, so its first revision
        # is diffed against "".
        last_text = ""
        for text in texts:
            result = differ.inline_json_diff(last_text, text)
            print(result, file=of)
            # Advance the baseline so the next revision is compared with
            # this one rather than with the empty string.
            last_text = text
|
Loading…
Reference in New Issue
Block a user