Increase diff cache capacities and the default batch size; start adding (incomplete) output buffering.

This commit is contained in:
Nathan TeBlunthuis 2025-08-03 09:24:35 -07:00
parent 2f853a879d
commit 1f08c01cf1
2 changed files with 12 additions and 7 deletions

View File

@ -422,6 +422,7 @@ class WikiqParser:
} }
for path in ns_paths.values(): for path in ns_paths.values():
Path(path).parent.mkdir(exist_ok=True, parents=True) Path(path).parent.mkdir(exist_ok=True, parents=True)
output_buffers = {ns: [] for ns, path in ns_paths.values()}
pq_writers = { pq_writers = {
ns: pq.ParquetWriter( ns: pq.ParquetWriter(
path, schema, flavor="spark", sorting_columns=sorting_cols path, schema, flavor="spark", sorting_columns=sorting_cols
@ -430,6 +431,7 @@ class WikiqParser:
} }
else: else:
output_buffer = []
writer = pacsv.CSVWriter( writer = pacsv.CSVWriter(
self.output_file, self.output_file,
schema, schema,
@ -493,9 +495,9 @@ class WikiqParser:
num_context_lines=1000000, num_context_lines=1000000,
max_word_level_diff_complexity=-1, max_word_level_diff_complexity=-1,
moved_paragraph_detection_cutoff=-1, moved_paragraph_detection_cutoff=-1,
words_cache_capacity=500, words_cache_capacity=10000,
diff_cache_capacity=500, diff_cache_capacity=10000,
stats_cache_capacity=500, stats_cache_capacity=10000,
) )
while not on_last_batch: while not on_last_batch:
@ -677,7 +679,10 @@ class WikiqParser:
del row_buffer["text"] del row_buffer["text"]
if self.partition_namespaces is True: if self.partition_namespaces is True:
output_buffer = output_buffers[page.mwpage.namespace]
writer = pq_writers[page.mwpage.namespace] writer = pq_writers[page.mwpage.namespace]
output_buffer += row_buffer
if(len(output_buffer) >
writer.write(pa.record_batch(row_buffer, schema=schema)) writer.write(pa.record_batch(row_buffer, schema=schema))
gc.collect() gc.collect()
page_count += 1 page_count += 1
@ -875,7 +880,7 @@ def main():
parser.add_argument( parser.add_argument(
"--batch-size", "--batch-size",
dest="batch_size", dest="batch_size",
default=1500, default=16000,
type=int, type=int,
help="How many revisions to process in each batch. This ends up being the Parquet row group size", help="How many revisions to process in each batch. This ends up being the Parquet row group size",
) )

View File

@ -339,9 +339,9 @@ class WikiDiffMatcher:
num_context_lines=1000000, num_context_lines=1000000,
max_word_level_diff_complexity=-1, max_word_level_diff_complexity=-1,
moved_paragraph_detection_cutoff=-1, moved_paragraph_detection_cutoff=-1,
words_cache_capacity=5000, words_cache_capacity=10000,
diff_cache_capacity=5000, diff_cache_capacity=10000,
stats_cache_capacity=5000, stats_cache_capacity=10000,
) )
self.last_diff = None self.last_diff = None