increase cache size.
This commit is contained in:
parent
2f853a879d
commit
1f08c01cf1
@ -422,6 +422,7 @@ class WikiqParser:
|
|||||||
}
|
}
|
||||||
for path in ns_paths.values():
|
for path in ns_paths.values():
|
||||||
Path(path).parent.mkdir(exist_ok=True, parents=True)
|
Path(path).parent.mkdir(exist_ok=True, parents=True)
|
||||||
|
output_buffers = {ns: [] for ns, path in ns_paths.values()}
|
||||||
pq_writers = {
|
pq_writers = {
|
||||||
ns: pq.ParquetWriter(
|
ns: pq.ParquetWriter(
|
||||||
path, schema, flavor="spark", sorting_columns=sorting_cols
|
path, schema, flavor="spark", sorting_columns=sorting_cols
|
||||||
@ -430,6 +431,7 @@ class WikiqParser:
|
|||||||
}
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
output_buffer = []
|
||||||
writer = pacsv.CSVWriter(
|
writer = pacsv.CSVWriter(
|
||||||
self.output_file,
|
self.output_file,
|
||||||
schema,
|
schema,
|
||||||
@ -493,9 +495,9 @@ class WikiqParser:
|
|||||||
num_context_lines=1000000,
|
num_context_lines=1000000,
|
||||||
max_word_level_diff_complexity=-1,
|
max_word_level_diff_complexity=-1,
|
||||||
moved_paragraph_detection_cutoff=-1,
|
moved_paragraph_detection_cutoff=-1,
|
||||||
words_cache_capacity=500,
|
words_cache_capacity=10000,
|
||||||
diff_cache_capacity=500,
|
diff_cache_capacity=10000,
|
||||||
stats_cache_capacity=500,
|
stats_cache_capacity=10000,
|
||||||
)
|
)
|
||||||
|
|
||||||
while not on_last_batch:
|
while not on_last_batch:
|
||||||
@ -677,7 +679,10 @@ class WikiqParser:
|
|||||||
del row_buffer["text"]
|
del row_buffer["text"]
|
||||||
|
|
||||||
if self.partition_namespaces is True:
|
if self.partition_namespaces is True:
|
||||||
|
output_buffer = output_buffers[page.mwpage.namespace]
|
||||||
writer = pq_writers[page.mwpage.namespace]
|
writer = pq_writers[page.mwpage.namespace]
|
||||||
|
output_buffer += row_buffer
|
||||||
|
if(len(output_buffer) >
|
||||||
writer.write(pa.record_batch(row_buffer, schema=schema))
|
writer.write(pa.record_batch(row_buffer, schema=schema))
|
||||||
gc.collect()
|
gc.collect()
|
||||||
page_count += 1
|
page_count += 1
|
||||||
@ -875,7 +880,7 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch-size",
|
"--batch-size",
|
||||||
dest="batch_size",
|
dest="batch_size",
|
||||||
default=1500,
|
default=16000,
|
||||||
type=int,
|
type=int,
|
||||||
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
help="How many revisions to process in each batch. This ends up being the Parquet row group size",
|
||||||
)
|
)
|
||||||
|
@ -339,9 +339,9 @@ class WikiDiffMatcher:
|
|||||||
num_context_lines=1000000,
|
num_context_lines=1000000,
|
||||||
max_word_level_diff_complexity=-1,
|
max_word_level_diff_complexity=-1,
|
||||||
moved_paragraph_detection_cutoff=-1,
|
moved_paragraph_detection_cutoff=-1,
|
||||||
words_cache_capacity=5000,
|
words_cache_capacity=10000,
|
||||||
diff_cache_capacity=5000,
|
diff_cache_capacity=10000,
|
||||||
stats_cache_capacity=5000,
|
stats_cache_capacity=10000,
|
||||||
)
|
)
|
||||||
self.last_diff = None
|
self.last_diff = None
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user