"""Diff consecutive revision texts of each article stored in a Parquet file.

Usage: pass the Parquet file path as the first command-line argument.  The
file must contain the columns ``articleid`` and ``text``.  Each diff result is
written to ``/dev/null``, i.e. the output is intentionally discarded.
"""
import sys
from itertools import groupby
from operator import itemgetter
from typing import Generator, Tuple, List, Any, Dict

import pyarrow.parquet as pq
import pywikidiff2

# One differ instance is created up front and reused for every diff.
differ = pywikidiff2.pywikidiff2()


def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
    """Yield the rows of *parquet_file* one at a time as dicts.

    Only the ``articleid`` and ``text`` columns are read.  The file is
    consumed batch by batch, so it is never loaded into memory as a whole.
    """
    for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
        yield from batch.to_pylist()


def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, List[str]], None, None]:
    """Yield ``(article_id, texts)`` for each run of rows sharing an article id.

    Rows are grouped with ``itertools.groupby``, which only merges *adjacent*
    rows with equal keys; an article whose rows are not contiguous in the file
    is yielded once per contiguous run.

    Errors while opening or reading the file are reported on stderr and end
    the stream early; they are not re-raised (best-effort behaviour).

    Args:
        file_path: Path of the Parquet file to read.

    Yields:
        Tuples of the article id and the list of its ``text`` values, in file
        order.
    """
    try:
        pq_file = pq.ParquetFile(file_path)
        row_stream = _iter_row_dicts(pq_file)
        for article_id, group_iterator in groupby(row_stream, key=itemgetter('articleid')):
            texts = [row['text'] for row in group_iterator]
            yield article_id, texts
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.", file=sys.stderr)
    except Exception as e:
        # Deliberately broad: any read failure is reported and ends the stream.
        print(f"An error occurred: {e}", file=sys.stderr)


with open('/dev/null', 'w') as of:
    for article_id, texts in stream_text_by_article_streaming(sys.argv[1]):
        # Every group starts a new article, so its first revision is diffed
        # against the empty string.  groupby never yields two consecutive
        # groups with the same id, so no explicit id comparison is needed.
        last_text = ""
        # NOTE(review): assumes rows of an article are stored in revision
        # order - confirm against the producer of the Parquet file.
        for text in texts:
            result = differ.inline_json_diff(last_text, text)
            print(result, file=of)
            # The current revision becomes the baseline for the next diff.
            last_text = text