Add memray for debugging memory usage.
This commit is contained in:
parent
6d03cac28d
commit
d20075b323
@ -7,6 +7,7 @@ requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
"deltas>=0.7.0",
|
||||
"mediawiki-utilities>=0.4.18",
|
||||
"more-itertools>=10.7.0",
|
||||
"mwpersistence>=0.2.4",
|
||||
"mwreverts>=0.1.5",
|
||||
"mwtypes>=0.4.0",
|
||||
@ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ipython>=8.18.1",
|
||||
"memray>=1.17.2",
|
||||
"pandas>=2.1.0",
|
||||
"pytest>=8.4.1",
|
||||
"pytest-asyncio>=1.0.0",
|
||||
|
@ -3,7 +3,6 @@
|
||||
# original wikiq headers are: title articleid revid date_time anon
|
||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||
# additions_size deletions_size
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os.path
|
||||
@ -227,7 +226,7 @@ class WikiqParser:
|
||||
namespaces: Union[list[int], None] = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
batch_size: int = 100,
|
||||
batch_size: int = 1024,
|
||||
partition_namespaces: bool = False,
|
||||
):
|
||||
"""
|
||||
@ -871,9 +870,9 @@ def main():
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
dest="batch_size",
|
||||
default=100,
|
||||
default=16000,
|
||||
type=int,
|
||||
help="How many revisions to process in each batch",
|
||||
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
Loading…
Reference in New Issue
Block a user