increase cache size.
This commit is contained in:
parent
2f853a879d
commit
1f08c01cf1
@ -422,6 +422,7 @@ class WikiqParser:
|
||||
}
|
||||
for path in ns_paths.values():
|
||||
Path(path).parent.mkdir(exist_ok=True, parents=True)
|
||||
output_buffers = {ns: [] for ns, path in ns_paths.values()}
|
||||
pq_writers = {
|
||||
ns: pq.ParquetWriter(
|
||||
path, schema, flavor="spark", sorting_columns=sorting_cols
|
||||
@ -430,6 +431,7 @@ class WikiqParser:
|
||||
}
|
||||
|
||||
else:
|
||||
output_buffer = []
|
||||
writer = pacsv.CSVWriter(
|
||||
self.output_file,
|
||||
schema,
|
||||
@ -493,9 +495,9 @@ class WikiqParser:
|
||||
num_context_lines=1000000,
|
||||
max_word_level_diff_complexity=-1,
|
||||
moved_paragraph_detection_cutoff=-1,
|
||||
words_cache_capacity=500,
|
||||
diff_cache_capacity=500,
|
||||
stats_cache_capacity=500,
|
||||
words_cache_capacity=10000,
|
||||
diff_cache_capacity=10000,
|
||||
stats_cache_capacity=10000,
|
||||
)
|
||||
|
||||
while not on_last_batch:
|
||||
@ -677,7 +679,10 @@ class WikiqParser:
|
||||
del row_buffer["text"]
|
||||
|
||||
if self.partition_namespaces is True:
|
||||
output_buffer = output_buffers[page.mwpage.namespace]
|
||||
writer = pq_writers[page.mwpage.namespace]
|
||||
output_buffer += row_buffer
|
||||
if(len(output_buffer) >
|
||||
writer.write(pa.record_batch(row_buffer, schema=schema))
|
||||
gc.collect()
|
||||
page_count += 1
|
||||
@ -875,7 +880,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
dest="batch_size",
|
||||
default=1500,
|
||||
default=16000,
|
||||
type=int,
|
||||
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
||||
)
|
||||
|
@ -339,9 +339,9 @@ class WikiDiffMatcher:
|
||||
num_context_lines=1000000,
|
||||
max_word_level_diff_complexity=-1,
|
||||
moved_paragraph_detection_cutoff=-1,
|
||||
words_cache_capacity=5000,
|
||||
diff_cache_capacity=5000,
|
||||
stats_cache_capacity=5000,
|
||||
words_cache_capacity=10000,
|
||||
diff_cache_capacity=10000,
|
||||
stats_cache_capacity=10000,
|
||||
)
|
||||
self.last_diff = None
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user