pywikidiff2/test/benchmark_wd2.py

import pyarrow.parquet as pq
from itertools import groupby
from operator import itemgetter
import sys
from typing import Generator, Tuple, List, Any, Dict
import pywikidiff2

differ = pywikidiff2.pywikidiff2()

def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
    for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
        yield from batch.to_pylist()


def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, List[str]], None, None]:
    try:
        pq_file = pq.ParquetFile(file_path)
        row_stream = _iter_row_dicts(pq_file)
        for article_id, group_iterator in groupby(row_stream,
                                                  key=itemgetter('articleid')):
            texts = [row['text'] for row in group_iterator]
            yield article_id, texts

    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


last_text = ""
last_article_id = None


with open('/dev/null', 'w') as of:
    for article_id, text in stream_text_by_article_streaming(sys.argv[1]):
        if article_id != last_article_id:
            last_article_id = article_id
            last_text = ""
        result = differ.inline_json_diff(last_text, text)
        print(result, file=of)