diff --git a/pyproject.toml b/pyproject.toml
index 81cfb0e..6b0cb9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,3 +10,8 @@ readme = "README.md"
 requires-python = ">=3.9"
 dependencies = []
 authors = [{ name="Nathan TeBlunthuis", email="nathanteblunthuis@gmail.com"}]
+
+[dependency-groups]
+dev = [
+    "pyarrow>=21.0.0",
+]
diff --git a/test/benchmark_wd2.py b/test/benchmark_wd2.py
new file mode 100644
index 0000000..fbc1bf5
--- /dev/null
+++ b/test/benchmark_wd2.py
@@ -0,0 +1,67 @@
+"""Benchmark pywikidiff2 on consecutive article revisions read from Parquet.
+
+Usage: python test/benchmark_wd2.py <revisions.parquet>
+
+The Parquet file must provide 'articleid' and 'text' columns. Diff output is
+written to the null device and discarded.
+"""
+import os
+import sys
+from itertools import groupby
+from operator import itemgetter
+from typing import Any, Dict, Generator, List, Tuple
+
+import pyarrow.parquet as pq
+import pywikidiff2
+
+differ = pywikidiff2.pywikidiff2()
+
+
+def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
+    """Yield each row of *parquet_file* as a dict with 'articleid' and 'text' keys."""
+    # Read batch by batch so the whole file is never held in memory at once.
+    for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
+        yield from batch.to_pylist()
+
+
+def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, List[str]], None, None]:
+    """Yield ``(article_id, texts)`` for each run of rows sharing an article id.
+
+    ``texts`` holds the 'text' values of the run in file order. ``groupby`` only
+    merges adjacent rows, so an article id that reappears later in the file
+    starts a new group.
+
+    Errors are reported on stderr and end the stream instead of propagating.
+    """
+    try:
+        # The context manager closes the file even if the consumer stops early.
+        with pq.ParquetFile(file_path) as pq_file:
+            rows = _iter_row_dicts(pq_file)
+            for article_id, group in groupby(rows, key=itemgetter('articleid')):
+                yield article_id, [row['text'] for row in group]
+    except FileNotFoundError:
+        print(f"Error: The file at '{file_path}' was not found.", file=sys.stderr)
+    except Exception as e:
+        print(f"An error occurred: {e}", file=sys.stderr)
+
+
+def main(argv: List[str]) -> int:
+    """Diff every revision against its predecessor; return a process exit code."""
+    if len(argv) < 2:
+        print(f"usage: {argv[0]} <revisions.parquet>", file=sys.stderr)
+        return 2
+
+    with open(os.devnull, 'w') as of:
+        for _article_id, texts in stream_text_by_article_streaming(argv[1]):
+            # Each article starts from an empty text, so its first revision is
+            # diffed as a pure insertion.
+            # NOTE(review): assumes rows are stored in revision order - confirm.
+            last_text = ""
+            for text in texts:
+                result = differ.inline_json_diff(last_text, text)
+                print(result, file=of)
+                last_text = text
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))