Add memray for debugging memory usage.

This commit is contained in:
Nathan TeBlunthuis
2025-07-17 15:17:23 -07:00
parent 6d03cac28d
commit d20075b323
2 changed files with 5 additions and 4 deletions

View File

@@ -3,7 +3,6 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import json
import os.path
@@ -227,7 +226,7 @@ class WikiqParser:
namespaces: Union[list[int], None] = None,
revert_radius: int = 15,
output_parquet: bool = True,
batch_size: int = 100,
batch_size: int = 1024,
partition_namespaces: bool = False,
):
"""
@@ -871,9 +870,9 @@ def main():
parser.add_argument(
"--batch-size",
dest="batch_size",
default=100,
default=16000,
type=int,
help="How many revisions to process in each batch",
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
)
args = parser.parse_args()