Add memray for debugging memory usage; increase default batch sizes.

This commit is contained in:
Nathan TeBlunthuis 2025-07-17 15:17:23 -07:00
parent 6d03cac28d
commit d20075b323
2 changed files with 5 additions and 4 deletions

View File

@@ -7,6 +7,7 @@ requires-python = ">=3.9"
dependencies = [
"deltas>=0.7.0",
"mediawiki-utilities>=0.4.18",
"more-itertools>=10.7.0",
"mwpersistence>=0.2.4",
"mwreverts>=0.1.5",
"mwtypes>=0.4.0",
@@ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi
[dependency-groups]
dev = [
"ipython>=8.18.1",
"memray>=1.17.2",
"pandas>=2.1.0",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",

View File

@@ -3,7 +3,6 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import json
import os.path
@@ -227,7 +226,7 @@ class WikiqParser:
namespaces: Union[list[int], None] = None,
revert_radius: int = 15,
output_parquet: bool = True,
batch_size: int = 100,
batch_size: int = 1024,
partition_namespaces: bool = False,
):
"""
@@ -871,9 +870,9 @@ def main():
parser.add_argument(
"--batch-size",
dest="batch_size",
default=100,
default=16000,
type=int,
help="How many revisions to process in each batch",
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
)
args = parser.parse_args()