Add memray for debugging memory usage.
This commit is contained in:
parent
6d03cac28d
commit
d20075b323
@ -7,6 +7,7 @@ requires-python = ">=3.9"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"deltas>=0.7.0",
|
"deltas>=0.7.0",
|
||||||
"mediawiki-utilities>=0.4.18",
|
"mediawiki-utilities>=0.4.18",
|
||||||
|
"more-itertools>=10.7.0",
|
||||||
"mwpersistence>=0.2.4",
|
"mwpersistence>=0.2.4",
|
||||||
"mwreverts>=0.1.5",
|
"mwreverts>=0.1.5",
|
||||||
"mwtypes>=0.4.0",
|
"mwtypes>=0.4.0",
|
||||||
@ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi
|
|||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
"ipython>=8.18.1",
|
"ipython>=8.18.1",
|
||||||
|
"memray>=1.17.2",
|
||||||
"pandas>=2.1.0",
|
"pandas>=2.1.0",
|
||||||
"pytest>=8.4.1",
|
"pytest>=8.4.1",
|
||||||
"pytest-asyncio>=1.0.0",
|
"pytest-asyncio>=1.0.0",
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
# original wikiq headers are: title articleid revid date_time anon
|
# original wikiq headers are: title articleid revid date_time anon
|
||||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||||
# additions_size deletions_size
|
# additions_size deletions_size
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os.path
|
import os.path
|
||||||
@ -227,7 +226,7 @@ class WikiqParser:
|
|||||||
namespaces: Union[list[int], None] = None,
|
namespaces: Union[list[int], None] = None,
|
||||||
revert_radius: int = 15,
|
revert_radius: int = 15,
|
||||||
output_parquet: bool = True,
|
output_parquet: bool = True,
|
||||||
batch_size: int = 100,
|
batch_size: int = 1024,
|
||||||
partition_namespaces: bool = False,
|
partition_namespaces: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@ -871,9 +870,9 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch-size",
|
"--batch-size",
|
||||||
dest="batch_size",
|
dest="batch_size",
|
||||||
default=100,
|
default=16000,
|
||||||
type=int,
|
type=int,
|
||||||
help="How many revisions to process in each batch",
|
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
Loading…
Reference in New Issue
Block a user