try fixing the memory problem.

Nathan TeBlunthuis 2025-07-14 18:58:27 -07:00
parent 76d54ae597
commit e53e7ada5d
2 changed files with 113 additions and 97 deletions
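
The fix replaces whole-page buffering with fixed-size batches of edit sessions pulled via more_itertools.chunked, so only buffer_size revisions need to be held in memory at a time. A minimal sketch of that batching pattern (illustration only, with stand-in data, not code from this commit):

# more_itertools.chunked yields successive lists of at most n items from any
# iterable, so a long revision stream never has to be materialized at once.
from more_itertools import chunked

revisions = (f"rev-{i}" for i in range(10))  # stand-in for a page's revision stream
buffer_size = 4                              # the commit's new default is 200

for batch in chunked(revisions, buffer_size):
    print(batch)  # a plain list: ['rev-0', 'rev-1', 'rev-2', 'rev-3'], then the next four, ...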

View File

@@ -7,6 +7,7 @@ requires-python = ">=3.9"
 dependencies = [
     "deltas>=0.7.0",
     "mediawiki-utilities>=0.4.18",
+    "more-itertools>=10.7.0",
     "mwpersistence>=0.2.4",
     "mwreverts>=0.1.5",
     "mwtypes>=0.4.0",

View File

@@ -11,6 +11,7 @@ import sys
 from collections import deque
 from hashlib import sha1
 from io import TextIOWrapper
+from more_itertools import chunked
 from itertools import groupby
 from subprocess import PIPE, Popen
 from typing import IO, Any, Generator, TextIO, Union
@@ -215,7 +216,7 @@ class WikiqParser:
                  namespaces: Union[list[int], None] = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
-                 parquet_buffer_size: int = 2000,
+                 buffer_size: int = 200,
                  partition_namespaces: bool = False,
                  ):
@@ -243,12 +244,13 @@ class WikiqParser:
         self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
 
         # here we initialize the variables we need for output.
+        self.buffer_size = buffer_size
         if output_parquet is True:
             self.output_parquet = True
             self.pq_writer = None
             self.output_file = output_file
             self.parquet_buffer = []
-            self.parquet_buffer_size = parquet_buffer_size
         else:
             self.print_header = True
             if output_file == sys.stdout.buffer:
@@ -400,9 +402,8 @@ class WikiqParser:
         regex_matches = {}
 
-        # Iterate through pages
+        # Iterate through pages; note that collapse_revs has already been applied.
         for page in dump:
-            revision_texts = []
 
             # skip namespaces not in the filter
             if self.namespace_filter is not None:
@@ -415,123 +416,137 @@
                 else:
                     reverts_column.rev_detector = None
 
-            # Iterate through a page's revisions
-            for revs in page:
-                # Revisions may or may not be grouped into lists of contiguous revisions by the
-                # same user. We call these "edit sessions". Otherwise revs is a list containing
-                # exactly one revision.
-                revs = list(revs)
-                revs = fix_hex_digests(revs)
-                table.add(page.mwpage, revs)
-
-                # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
-                #     redirect = True
-                # else:
-                #     redirect = False
-
-                # TODO missing: additions_size deletions_size
-                rev_count += 1
-
-                # Get the last revision in the edit session.
-                rev = revs[-1]
-                regex_dict = self.matchmake_revision(rev)
-                for k, v in regex_dict.items():
-                    if regex_matches.get(k) is None:
-                        regex_matches[k] = []
-                    regex_matches[k].append(v)
-
-                revision_texts.append(rev.text)
-
-            wikidiff_matcher = None
-            if self.diff or self.persist == PersistMethod.wikidiff2:
-                wikidiff_matcher = WikiDiffMatcher(revision_texts,
-                                                   tokenizer=wikitext_split,
-                                                   )
-
-            # Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
-            row_buffer = table.pop()
-
-            if self.diff:
-                row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))]
-
-            is_revert_column: list[Union[bool, None]] = []
-            for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
-                if self.revert_radius == 0 or d:
-                    is_revert_column.append(None)
-                else:
-                    is_revert_column.append(r is not None)
-
-            row_buffer['revert'] = is_revert_column
-
-            for k, v in regex_matches.items():
-                row_buffer[k] = v
-            regex_matches = {}
-
-            if self.persist != PersistMethod.none:
-                window = deque(maxlen=PERSISTENCE_RADIUS)
-
-                row_buffer['token_revs'] = []
-                row_buffer['tokens_added'] = []
-                row_buffer['tokens_removed'] = []
-                row_buffer['tokens_window'] = []
-
-                if self.persist == PersistMethod.sequence:
-                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
-                                                    revert_radius=PERSISTENCE_RADIUS)
-                elif self.persist == PersistMethod.segment:
-                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
-                                                    revert_radius=PERSISTENCE_RADIUS)
-                elif self.persist == PersistMethod.wikidiff2:
-                    state = mwpersistence.DiffState(wikidiff_matcher,
-                                                    revert_radius=PERSISTENCE_RADIUS)
-                else:
-                    from mw.lib import persistence
-                    state = persistence.State()
-
-                for idx, text in enumerate(row_buffer['text']):
-                    rev_id = row_buffer['revid'][idx]
-                    if self.persist != PersistMethod.legacy:
-                        _, tokens_added, tokens_removed = state.update(text, rev_id)
-                    else:
-                        _, tokens_added, tokens_removed = state.process(text, rev_id)
-
-                    window.append((rev_id, tokens_added, tokens_removed))
-
-                    if len(window) == PERSISTENCE_RADIUS:
-                        old_rev_id, old_tokens_added, old_tokens_removed = window.popleft()
-                        num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
-
-                        row_buffer['token_revs'].append(num_token_revs)
-                        row_buffer['tokens_added'].append(num_tokens)
-                        row_buffer['tokens_removed'].append(len(old_tokens_removed))
-                        row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1)
-
-                # print out metadata for the last RADIUS revisions
-                for i, item in enumerate(window):
-                    # if the window was full, we've already printed item 0
-                    if len(window) == PERSISTENCE_RADIUS and i == 0:
-                        continue
-
-                    rev_id, tokens_added, tokens_removed = item
-                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
-
-                    row_buffer['token_revs'].append(num_token_revs)
-                    row_buffer['tokens_added'].append(num_tokens)
-                    row_buffer['tokens_removed'].append(len(tokens_removed))
-                    row_buffer['tokens_window'].append(len(window) - (i + 1))
-
-            if not self.text:
-                del row_buffer['text']
-
-            if self.partition_namespaces is True:
-                writer = pq_writers[page.mwpage.namespace]
-
-            writer.write(pa.table(row_buffer, schema=schema))
+            # Iterate through a page in batches of edit (sessions)
+            batches = chunked(page, self.buffer_size)
+            last_rev_text = None
+            for batch in batches:
+                output_buffer = None
+                revision_texts = [] if last_rev_text is None else [last_rev_text]
+
+                for revs in batch:
+                    revs = fix_hex_digests(list(revs))
+                    # Revisions may or may not be grouped into lists of contiguous revisions by the
+                    # same user. We call these "edit sessions". Otherwise revs is a list containing
+                    # exactly one revision.
+                    # This is very broken; we can't bring all revisions into memory at once.
+                    # reverts_column.reset()
+                    table.add(page.mwpage, revs)
+
+                    # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
+                    #     redirect = True
+                    # else:
+                    #     redirect = False
+
+                    # TODO missing: additions_size deletions_size
+                    rev_count += 1
+
+                    # Get the last revision in the edit session.
+                    rev = revs[-1]
+                    regex_dict = self.matchmake_revision(rev)
+                    for k, v in regex_dict.items():
+                        if regex_matches.get(k) is None:
+                            regex_matches[k] = []
+                        regex_matches[k].append(v)
+
+                    revision_texts.append(rev.text)
+                    last_rev_text = rev.text
+
+                wikidiff_matcher = None
+                if self.diff or self.persist == PersistMethod.wikidiff2:
+                    wikidiff_matcher = WikiDiffMatcher(revision_texts,
+                                                       tokenizer=wikitext_split,
+                                                       )
+
+                # Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
+                row_buffer = table.pop()
+
+                if self.diff:
+                    row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts) - (1 if last_rev_text is not None else 0))]
+
+                is_revert_column: list[Union[bool, None]] = []
+                print("row_buffer:deleted" + str(row_buffer['deleted']), file=open("debug",'w'))
+                print("row_buffer:reverteds" + str(row_buffer['reverteds']), file=open("debug",'a'))
+                for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
+                    if self.revert_radius == 0 or d:
+                        is_revert_column.append(None)
+                    else:
+                        is_revert_column.append(r is not None)
+
+                row_buffer['revert'] = is_revert_column
+                print("row_buffer:revert" + str(row_buffer['revert']),file=open("debug",'a'))
+
+                for k, v in regex_matches.items():
+                    row_buffer[k] = v
+                regex_matches = {}
+
+                if self.persist != PersistMethod.none:
+                    window = deque(maxlen=PERSISTENCE_RADIUS)
+
+                    row_buffer['token_revs'] = []
+                    row_buffer['tokens_added'] = []
+                    row_buffer['tokens_removed'] = []
+                    row_buffer['tokens_window'] = []
+
+                    if self.persist == PersistMethod.sequence:
+                        state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
+                                                        revert_radius=PERSISTENCE_RADIUS)
+                    elif self.persist == PersistMethod.segment:
+                        state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
+                                                        revert_radius=PERSISTENCE_RADIUS)
+                    elif self.persist == PersistMethod.wikidiff2:
+                        state = mwpersistence.DiffState(wikidiff_matcher,
+                                                        revert_radius=PERSISTENCE_RADIUS)
+                    else:
+                        from mw.lib import persistence
+                        state = persistence.State()
+
+                    for idx, text in enumerate(revision_texts[(0 if last_rev_text is not None else 1):]):
+                        rb_idx = idx - (0 if last_rev_text is not None else 1)
+                        rev_id = row_buffer['revid'][rb_idx]
+                        if self.persist != PersistMethod.legacy:
+                            _, tokens_added, tokens_removed = state.update(text, rev_id)
+                        else:
+                            _, tokens_added, tokens_removed = state.process(text, rev_id)
+
+                        window.append((rev_id, tokens_added, tokens_removed))
+
+                        if len(window) == PERSISTENCE_RADIUS:
+                            old_rev_id, old_tokens_added, old_tokens_removed = window.popleft()
+                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
+
+                            row_buffer['token_revs'].append(num_token_revs)
+                            row_buffer['tokens_added'].append(num_tokens)
+                            row_buffer['tokens_removed'].append(len(old_tokens_removed))
+                            row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1)
+
+                    # print out metadata for the last RADIUS revisions
+                    for i, item in enumerate(window):
+                        # if the window was full, we've already printed item 0
+                        if len(window) == PERSISTENCE_RADIUS and i == 0:
+                            continue
+
+                        rev_id, tokens_added, tokens_removed = item
+                        num_token_revs, num_tokens = calculate_persistence(tokens_added)
+
+                        row_buffer['token_revs'].append(num_token_revs)
+                        row_buffer['tokens_added'].append(num_tokens)
+                        row_buffer['tokens_removed'].append(len(tokens_removed))
+                        row_buffer['tokens_window'].append(len(window) - (i + 1))
+
+                if not self.text:
+                    del row_buffer['text']
+
+                if self.partition_namespaces is True:
+                    writer = pq_writers[page.mwpage.namespace]
+                print("output_buffer:" + str(output_buffer), file=open('debug','a'))
+                if output_buffer is None:
+                    output_buffer = row_buffer
+                else:
+                    [output_buffer[k].extend(row_buffer[k]) for k in output_buffer.keys()]
+                print("output_buffer:" + str(output_buffer), file=open('debug','a'))
+
+                record_batch = pa.record_batch(output_buffer, schema=schema)
+                writer.write_batch(record_batch)
 
             page_count += 1
 
         print("Done: %s revisions and %s pages." % (rev_count, page_count),