Add memray for debugging memory usage; also add more-itertools and raise the default batch sizes.
This commit is contained in:
		
							parent
							
								
									6d03cac28d
								
							
						
					
					
						commit
						d20075b323
					
				| @ -7,6 +7,7 @@ requires-python = ">=3.9" | |||||||
| dependencies = [ | dependencies = [ | ||||||
|     "deltas>=0.7.0", |     "deltas>=0.7.0", | ||||||
|     "mediawiki-utilities>=0.4.18", |     "mediawiki-utilities>=0.4.18", | ||||||
|  |     "more-itertools>=10.7.0", | ||||||
|     "mwpersistence>=0.2.4", |     "mwpersistence>=0.2.4", | ||||||
|     "mwreverts>=0.1.5", |     "mwreverts>=0.1.5", | ||||||
|     "mwtypes>=0.4.0", |     "mwtypes>=0.4.0", | ||||||
| @ -37,6 +38,7 @@ pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidi | |||||||
| [dependency-groups] | [dependency-groups] | ||||||
| dev = [ | dev = [ | ||||||
|     "ipython>=8.18.1", |     "ipython>=8.18.1", | ||||||
|  |     "memray>=1.17.2", | ||||||
|     "pandas>=2.1.0", |     "pandas>=2.1.0", | ||||||
|     "pytest>=8.4.1", |     "pytest>=8.4.1", | ||||||
|     "pytest-asyncio>=1.0.0", |     "pytest-asyncio>=1.0.0", | ||||||
|  | |||||||
| @ -3,7 +3,6 @@ | |||||||
| # original wikiq headers are: title articleid revid date_time anon | # original wikiq headers are: title articleid revid date_time anon | ||||||
| # editor editor_id minor text_size text_entropy text_md5 reversion | # editor editor_id minor text_size text_entropy text_md5 reversion | ||||||
| # additions_size deletions_size | # additions_size deletions_size | ||||||
| 
 |  | ||||||
| import argparse | import argparse | ||||||
| import json | import json | ||||||
| import os.path | import os.path | ||||||
| @ -227,7 +226,7 @@ class WikiqParser: | |||||||
|         namespaces: Union[list[int], None] = None, |         namespaces: Union[list[int], None] = None, | ||||||
|         revert_radius: int = 15, |         revert_radius: int = 15, | ||||||
|         output_parquet: bool = True, |         output_parquet: bool = True, | ||||||
|         batch_size: int = 100, |         batch_size: int = 1024, | ||||||
|         partition_namespaces: bool = False, |         partition_namespaces: bool = False, | ||||||
|     ): |     ): | ||||||
|         """ |         """ | ||||||
| @ -871,9 +870,9 @@ def main(): | |||||||
|     parser.add_argument( |     parser.add_argument( | ||||||
|         "--batch-size", |         "--batch-size", | ||||||
|         dest="batch_size", |         dest="batch_size", | ||||||
|         default=100, |         default=16000, | ||||||
|         type=int, |         type=int, | ||||||
|         help="How many revisions to process in each batch", |         help="How many revisions to process in each batch. This ends up being the Parquet row group size", | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     args = parser.parse_args() |     args = parser.parse_args() | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user