Configure pywikidiff2 cache limits.
This commit is contained in:
		
							parent
							
								
									83c92d1a37
								
							
						
					
					
						commit
						54e996b910
					
				| @ -3,8 +3,8 @@ | |||||||
| # original wikiq headers are: title articleid revid date_time anon | # original wikiq headers are: title articleid revid date_time anon | ||||||
| # editor editor_id minor text_size text_entropy text_md5 reversion | # editor editor_id minor text_size text_entropy text_md5 reversion | ||||||
| # additions_size deletions_size | # additions_size deletions_size | ||||||
| import gc |  | ||||||
| import argparse | import argparse | ||||||
|  | import gc | ||||||
| import json | import json | ||||||
| import os.path | import os.path | ||||||
| import re | import re | ||||||
| @ -465,7 +465,6 @@ class WikiqParser: | |||||||
|             next_batch = {} |             next_batch = {} | ||||||
|             diff_dict = {} |             diff_dict = {} | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|             if self.persist != PersistMethod.none: |             if self.persist != PersistMethod.none: | ||||||
|                 window = deque(maxlen=PERSISTENCE_RADIUS) |                 window = deque(maxlen=PERSISTENCE_RADIUS) | ||||||
|                 if self.persist != PersistMethod.none: |                 if self.persist != PersistMethod.none: | ||||||
| @ -488,11 +487,14 @@ class WikiqParser: | |||||||
|                         from mw.lib import persistence |                         from mw.lib import persistence | ||||||
| 
 | 
 | ||||||
|                         persist_state = persistence.State() |                         persist_state = persistence.State() | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|             if self.diff: |             if self.diff: | ||||||
|                 differ = pywikidiff2.pywikidiff2( |                 differ = pywikidiff2.pywikidiff2( | ||||||
|                     numContextLines=1000000, moved_paragraph_detection_cutoff=2000 |                     numContextLines=1000000, | ||||||
|  |                     moved_paragraph_detection_cutoff=200000, | ||||||
|  |                     words_cache_capacity=10000, | ||||||
|  |                     diff_cache_capacity=10000, | ||||||
|  |                     stats_cache_capacity=100000, | ||||||
|                 ) |                 ) | ||||||
| 
 | 
 | ||||||
|             while not on_last_batch: |             while not on_last_batch: | ||||||
| @ -595,7 +597,6 @@ class WikiqParser: | |||||||
|                     row_buffer[k] = v |                     row_buffer[k] = v | ||||||
|                     regex_matches = {} |                     regex_matches = {} | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|                 # begin persistence logic |                 # begin persistence logic | ||||||
|                 if self.persist != PersistMethod.none: |                 if self.persist != PersistMethod.none: | ||||||
|                     row_buffer["token_revs"] = [] |                     row_buffer["token_revs"] = [] | ||||||
| @ -937,7 +938,7 @@ def main(): | |||||||
|                 diff=args.diff, |                 diff=args.diff, | ||||||
|                 output_parquet=output_parquet, |                 output_parquet=output_parquet, | ||||||
|                 partition_namespaces=args.partition_namespaces, |                 partition_namespaces=args.partition_namespaces, | ||||||
|                 batch_size = args.batch_size |                 batch_size=args.batch_size, | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             wikiq.process() |             wikiq.process() | ||||||
| @ -960,7 +961,7 @@ def main(): | |||||||
|             regex_comment_label=args.regex_comment_label, |             regex_comment_label=args.regex_comment_label, | ||||||
|             diff=args.diff, |             diff=args.diff, | ||||||
|             text=args.text, |             text=args.text, | ||||||
|             batch_size=args.batch_size |             batch_size=args.batch_size, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         wikiq.process() |         wikiq.process() | ||||||
|  | |||||||
| @ -330,15 +330,17 @@ class WikiDiffMatcher: | |||||||
|     ): |     ): | ||||||
|         self.tokenizer = tokenizer or TOKENIZER |         self.tokenizer = tokenizer or TOKENIZER | ||||||
| 
 | 
 | ||||||
|              |  | ||||||
| 
 |  | ||||||
|     class Processor(DiffEngine.Processor): |     class Processor(DiffEngine.Processor): | ||||||
|         def __init__(self, tokenizer=None): |         def __init__(self, tokenizer=None): | ||||||
|             self.tokenizer = tokenizer or TOKENIZER |             self.tokenizer = tokenizer or TOKENIZER | ||||||
|             self.last_tokens = [] |             self.last_tokens = [] | ||||||
|             self.previous_text = "" |             self.previous_text = "" | ||||||
|             self.differ = pywikidiff2.pywikidiff2( |             self.differ = pywikidiff2.pywikidiff2( | ||||||
|                 numContextLines=1000000, moved_paragraph_detection_cutoff=200000 |                 numContextLines=1000000, | ||||||
|  |                 moved_paragraph_detection_cutoff=200000, | ||||||
|  |                 words_cache_capacity=10000, | ||||||
|  |                 diff_cache_capacity=10000, | ||||||
|  |                 stats_cache_capacity=100000, | ||||||
|             ) |             ) | ||||||
|             self.last_diff = None |             self.last_diff = None | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user