Set num_context_lines=10 on the pywikidiff2 differ; print page and revision counts after the run.
This commit is contained in:
parent
30ec25cb91
commit
7a61fd38d9
@ -5,7 +5,7 @@ import sys
|
|||||||
from typing import Generator, Tuple, List, Any, Dict
|
from typing import Generator, Tuple, List, Any, Dict
|
||||||
import pywikidiff2
|
import pywikidiff2
|
||||||
|
|
||||||
differ = pywikidiff2.pywikidiff2()
|
differ = pywikidiff2.pywikidiff2(num_context_lines=10)
|
||||||
|
|
||||||
def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
|
def _iter_row_dicts(parquet_file: pq.ParquetFile) -> Generator[Dict[str, Any], None, None]:
|
||||||
for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
|
for batch in parquet_file.iter_batches(columns=['articleid', 'text']):
|
||||||
@ -27,12 +27,16 @@ def stream_text_by_article_streaming(file_path: str) -> Generator[Tuple[Any, Lis
|
|||||||
print(f"An error occurred: {e}")
|
print(f"An error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
p = 0
|
||||||
|
i = 0
|
||||||
with open('/dev/null', 'w') as of:
|
with open('/dev/null', 'w') as of:
|
||||||
|
p = p + 1
|
||||||
for _, texts in stream_text_by_article_streaming(sys.argv[1]):
|
for _, texts in stream_text_by_article_streaming(sys.argv[1]):
|
||||||
|
i = i + 1
|
||||||
last_text = ""
|
last_text = ""
|
||||||
for text in texts:
|
for text in texts:
|
||||||
result = differ.inline_json_diff(last_text, text)
|
result = differ.inline_json_diff(last_text, text)
|
||||||
print(result, file=of)
|
print(result, file=of)
|
||||||
last_text = text
|
last_text = text
|
||||||
|
print(f"{p} pages")
|
||||||
|
print(f"{i} revisions")
|
||||||
|
Loading…
Reference in New Issue
Block a user