From 77f367d95e778847525cd30af7f4979b3e05ee3f Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Sun, 3 Aug 2025 09:36:42 -0700 Subject: [PATCH] Revert changes related to row-buffering to just "increase cache size." This reverts commit 1f08c01cf13d0d7dfae8480a43ebcdd77e67e8a7. --- src/wikiq/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index ea6bb76..426d43e 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -422,7 +422,6 @@ class WikiqParser: } for path in ns_paths.values(): Path(path).parent.mkdir(exist_ok=True, parents=True) - output_buffers = {ns: [] for ns, path in ns_paths.values()} pq_writers = { ns: pq.ParquetWriter( path, schema, flavor="spark", sorting_columns=sorting_cols @@ -431,7 +430,6 @@ class WikiqParser: } else: - output_buffer = [] writer = pacsv.CSVWriter( self.output_file, schema, @@ -679,10 +677,7 @@ class WikiqParser: del row_buffer["text"] if self.partition_namespaces is True: - output_buffer = output_buffers[page.mwpage.namespace] writer = pq_writers[page.mwpage.namespace] - output_buffer += row_buffer - if(len(output_buffer) > writer.write(pa.record_batch(row_buffer, schema=schema)) gc.collect() page_count += 1 @@ -880,7 +875,7 @@ def main(): parser.add_argument( "--batch-size", dest="batch_size", - default=16000, + default=1500, type=int, help="How many revisions to process in each batch. This ends up being the Parquet row group size", )