increase cache size.
This commit is contained in:
		
							parent
							
								
									2f853a879d
								
							
						
					
					
						commit
						1f08c01cf1
					
				| @ -422,6 +422,7 @@ class WikiqParser: | |||||||
|                 } |                 } | ||||||
|                 for path in ns_paths.values(): |                 for path in ns_paths.values(): | ||||||
|                     Path(path).parent.mkdir(exist_ok=True, parents=True) |                     Path(path).parent.mkdir(exist_ok=True, parents=True) | ||||||
|  |                 output_buffers = {ns: [] for ns, path in ns_paths.values()} | ||||||
|                 pq_writers = { |                 pq_writers = { | ||||||
|                     ns: pq.ParquetWriter( |                     ns: pq.ParquetWriter( | ||||||
|                         path, schema, flavor="spark", sorting_columns=sorting_cols |                         path, schema, flavor="spark", sorting_columns=sorting_cols | ||||||
| @ -430,6 +431,7 @@ class WikiqParser: | |||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|         else: |         else: | ||||||
|  |             output_buffer = [] | ||||||
|             writer = pacsv.CSVWriter( |             writer = pacsv.CSVWriter( | ||||||
|                 self.output_file, |                 self.output_file, | ||||||
|                 schema, |                 schema, | ||||||
| @ -493,9 +495,9 @@ class WikiqParser: | |||||||
|                     num_context_lines=1000000, |                     num_context_lines=1000000, | ||||||
|                     max_word_level_diff_complexity=-1, |                     max_word_level_diff_complexity=-1, | ||||||
|                     moved_paragraph_detection_cutoff=-1, |                     moved_paragraph_detection_cutoff=-1, | ||||||
|                     words_cache_capacity=500, |                     words_cache_capacity=10000, | ||||||
|                     diff_cache_capacity=500, |                     diff_cache_capacity=10000, | ||||||
|                     stats_cache_capacity=500, |                     stats_cache_capacity=10000, | ||||||
|                 ) |                 ) | ||||||
| 
 | 
 | ||||||
|             while not on_last_batch: |             while not on_last_batch: | ||||||
| @ -677,7 +679,10 @@ class WikiqParser: | |||||||
|                     del row_buffer["text"] |                     del row_buffer["text"] | ||||||
| 
 | 
 | ||||||
|                 if self.partition_namespaces is True: |                 if self.partition_namespaces is True: | ||||||
|  |                     output_buffer = output_buffers[page.mwpage.namespace] | ||||||
|                     writer = pq_writers[page.mwpage.namespace] |                     writer = pq_writers[page.mwpage.namespace] | ||||||
|  |                 output_buffer += row_buffer | ||||||
|  |                 if(len(output_buffer) >  | ||||||
|                 writer.write(pa.record_batch(row_buffer, schema=schema)) |                 writer.write(pa.record_batch(row_buffer, schema=schema)) | ||||||
|                 gc.collect() |                 gc.collect() | ||||||
|             page_count += 1 |             page_count += 1 | ||||||
| @ -875,7 +880,7 @@ def main(): | |||||||
|     parser.add_argument( |     parser.add_argument( | ||||||
|         "--batch-size", |         "--batch-size", | ||||||
|         dest="batch_size", |         dest="batch_size", | ||||||
|         default=1500, |         default=16000, | ||||||
|         type=int, |         type=int, | ||||||
|         help="How many revisions to process in each batch. This ends up being the Parquet row group size", |         help="How many revisions to process in each batch. This ends up being the Parquet row group size", | ||||||
|     ) |     ) | ||||||
|  | |||||||
| @ -339,9 +339,9 @@ class WikiDiffMatcher: | |||||||
|                 num_context_lines=1000000, |                 num_context_lines=1000000, | ||||||
|                 max_word_level_diff_complexity=-1, |                 max_word_level_diff_complexity=-1, | ||||||
|                 moved_paragraph_detection_cutoff=-1, |                 moved_paragraph_detection_cutoff=-1, | ||||||
|                 words_cache_capacity=5000, |                 words_cache_capacity=10000, | ||||||
|                 diff_cache_capacity=5000, |                 diff_cache_capacity=10000, | ||||||
|                 stats_cache_capacity=5000, |                 stats_cache_capacity=10000, | ||||||
|             ) |             ) | ||||||
|             self.last_diff = None |             self.last_diff = None | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user