1
0

set num_context_lines; output counts.

This commit is contained in:
Nathan TeBlunthuis 2025-08-07 17:05:06 -07:00
parent 30ec25cb91
commit 7a61fd38d9

View File

@ -5,7 +5,7 @@ import sys
from typing import Generator, Tuple, List, Any, Dict from typing import Generator, Tuple, List, Any, Dict
import pywikidiff2 import pywikidiff2
differ = pywikidiff2.pywikidiff2() differ = pywikidiff2.pywikidiff2(num_context_lines=10)
def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]: def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
for batch in parquet_file.iter_batches(columns=['articleid', 'text']): for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
@ -27,12 +27,16 @@ def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, Lis
print(f"An error occurred: {e}") print(f"An error occurred: {e}")
# Driver: diff each text of every article against the previous one and
# report how many pages and revisions were processed.
p = 0  # number of articles (pages) seen
i = 0  # number of texts (revisions) diffed
# The diff output itself is discarded (written to /dev/null); only the
# counts printed at the end reach stdout.
with open('/dev/null', 'w') as of:
    for _, texts in stream_text_by_article_streaming(sys.argv[1]):
        # Count once per article. Incrementing before the loop would leave
        # the page count stuck at 1.
        p += 1
        # The first text of each article is diffed against the empty string.
        last_text = ""
        for text in texts:
            # Count once per diffed text. Incrementing in the outer loop
            # would report the number of articles instead.
            i += 1
            result = differ.inline_json_diff(last_text, text)
            print(result, file=of)
            last_text = text

print(f"{p} pages")
print(f"{i} revisions")