Add memray for debugging memory usage; also add more-itertools and raise the default batch size.

This commit is contained in:
Nathan TeBlunthuis 2025-07-17 15:17:23 -07:00
parent 6d03cac28d
commit d20075b323
2 changed files with 5 additions and 4 deletions

View File

@ -7,6 +7,7 @@ requires-python = ">=3.9"
dependencies = [ dependencies = [
"deltas>=0.7.0", "deltas>=0.7.0",
"mediawiki-utilities>=0.4.18", "mediawiki-utilities>=0.4.18",
"more-itertools>=10.7.0",
"mwpersistence>=0.2.4", "mwpersistence>=0.2.4",
"mwreverts>=0.1.5", "mwreverts>=0.1.5",
"mwtypes>=0.4.0", "mwtypes>=0.4.0",
@ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi
[dependency-groups] [dependency-groups]
dev = [ dev = [
"ipython>=8.18.1", "ipython>=8.18.1",
"memray>=1.17.2",
"pandas>=2.1.0", "pandas>=2.1.0",
"pytest>=8.4.1", "pytest>=8.4.1",
"pytest-asyncio>=1.0.0", "pytest-asyncio>=1.0.0",

View File

@ -3,7 +3,6 @@
# original wikiq headers are: title articleid revid date_time anon # original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion # editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size # additions_size deletions_size
import argparse import argparse
import json import json
import os.path import os.path
@ -227,7 +226,7 @@ class WikiqParser:
namespaces: Union[list[int], None] = None, namespaces: Union[list[int], None] = None,
revert_radius: int = 15, revert_radius: int = 15,
output_parquet: bool = True, output_parquet: bool = True,
batch_size: int = 100, batch_size: int = 1024,
partition_namespaces: bool = False, partition_namespaces: bool = False,
): ):
""" """
@ -871,9 +870,9 @@ def main():
parser.add_argument( parser.add_argument(
"--batch-size", "--batch-size",
dest="batch_size", dest="batch_size",
default=100, default=16000,
type=int, type=int,
help="How many revisions to process in each batch", help="How many revisions to process in each batch. This ends up being the Parquet row group size",
) )
args = parser.parse_args() args = parser.parse_args()