diff --git a/pyproject.toml b/pyproject.toml index 82c44cf..d6b8d19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.9" dependencies = [ "deltas>=0.7.0", "mediawiki-utilities>=0.4.18", + "more-itertools>=10.7.0", "mwpersistence>=0.2.4", "mwreverts>=0.1.5", "mwtypes>=0.4.0", @@ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi [dependency-groups] dev = [ "ipython>=8.18.1", + "memray>=1.17.2", "pandas>=2.1.0", "pytest>=8.4.1", "pytest-asyncio>=1.0.0", diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 63a0a70..cf9d961 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -3,7 +3,6 @@ # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size - import argparse import json import os.path @@ -227,7 +226,7 @@ class WikiqParser: namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_parquet: bool = True, - batch_size: int = 100, + batch_size: int = 1024, partition_namespaces: bool = False, ): """ @@ -871,9 +870,9 @@ def main(): parser.add_argument( "--batch-size", dest="batch_size", - default=100, + default=16000, type=int, - help="How many revisions to process in each batch", + help="How many revisions to process in each batch. This ends up being the Parquet row group size", ) args = parser.parse_args()