Add memray for debugging memory usage.
This commit is contained in:
@@ -3,7 +3,6 @@
|
||||
# original wikiq headers are: title articleid revid date_time anon
|
||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||
# additions_size deletions_size
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os.path
|
||||
@@ -227,7 +226,7 @@ class WikiqParser:
|
||||
namespaces: Union[list[int], None] = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
batch_size: int = 100,
|
||||
batch_size: int = 1024,
|
||||
partition_namespaces: bool = False,
|
||||
):
|
||||
"""
|
||||
@@ -871,9 +870,9 @@ def main():
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
dest="batch_size",
|
||||
default=100,
|
||||
default=16000,
|
||||
type=int,
|
||||
help="How many revisions to process in each batch",
|
||||
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
Reference in New Issue
Block a user