Add per-namespace resume support for partitioned parquet output.

- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid))
  to correctly handle interleaved dump ordering in partitioned output
- Extract resume functionality to dedicated resume.py module
- Add graceful shutdown handling via shutdown_requested flag (CLI-level only)
- Use lazy ParquetWriter creation to avoid empty files on early exit
- Refactor writing logic to _write_batch() helper method
- Simplify control flow by replacing continue statements with should_write flag
This commit is contained in:
Nathan TeBlunthuis
2025-12-06 06:56:19 -08:00
parent d69d8b0df2
commit 577ddc87f5
3 changed files with 632 additions and 325 deletions

View File

@@ -8,6 +8,7 @@ import gc
import json
import os.path
import re
import signal
import sys
from collections import deque
from hashlib import sha1
@@ -28,6 +29,11 @@ import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
from wikiq.resume import (
get_resume_point,
setup_resume_temp_output,
finalize_resume_merge,
)
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
@@ -244,7 +250,7 @@ class WikiqParser:
output_parquet: bool = True,
batch_size: int = 1024,
partition_namespaces: bool = False,
resume_from_revid: int = None,
resume_point: Union[tuple, dict, None] = None,
external_links: bool = False,
citations: bool = False,
wikilinks: bool = False,
@@ -254,7 +260,10 @@ class WikiqParser:
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
resume_from_revid : if set, skip all revisions up to and including this revid
resume_point : if set, either a (pageid, revid) tuple for single-file output,
or a dict mapping namespace -> (pageid, revid) for partitioned output.
For single-file: skip all revisions up to
and including this point
"""
self.input_file = input_file
@@ -265,12 +274,13 @@ class WikiqParser:
self.diff = diff
self.text = text
self.partition_namespaces = partition_namespaces
self.resume_from_revid = resume_from_revid
self.resume_point = resume_point
self.external_links = external_links
self.citations = citations
self.wikilinks = wikilinks
self.templates = templates
self.headings = headings
self.shutdown_requested = False
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@@ -299,6 +309,27 @@ class WikiqParser:
else:
self.output_file = open(output_file, "wb")
def request_shutdown(self):
    """Signal the parser to stop gracefully.

    Sets the ``shutdown_requested`` flag; process() polls it and exits
    after the batch currently being written has been flushed.
    """
    self.shutdown_requested = True
def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace=None):
    """Write one batch of rows with the appropriate writer.

    For partitioned output the per-namespace ParquetWriter is created
    lazily on first use, so namespaces that never produce any rows never
    get an (empty) output file.

    Returns the writer that performed the write; for non-partitioned
    output this is the ``writer`` argument unchanged.
    """
    if namespace is not None and self.partition_namespaces:
        target = pq_writers.get(namespace)
        if target is None:
            dest = ns_paths[namespace]
            # Create the namespace=<ns> directory only now that we know
            # this namespace actually has data to write.
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            target = pq.ParquetWriter(
                dest, schema, flavor="spark", sorting_columns=sorting_cols
            )
            pq_writers[namespace] = target
        writer = target
    writer.write(pa.record_batch(row_buffer, schema=schema))
    return writer
def make_matchmake_pairs(self, patterns, labels) -> list[RegexPair]:
if (patterns is not None and labels is not None) and (
len(patterns) == len(labels)
@@ -358,26 +389,22 @@ class WikiqParser:
# input_filename)
# Track whether we've passed the resume point
found_resume_point = self.resume_from_revid is None
# For partitioned output, this is a dict mapping namespace -> bool
if self.resume_point is None:
found_resume_point = True
elif self.partition_namespaces:
found_resume_point = {}
else:
found_resume_point = False
# When resuming with parquet, write new data to temp file/directory and merge at the end
original_output_file = None
temp_output_file = None
if self.resume_from_revid is not None and self.output_parquet:
if isinstance(self.output_file, str) and os.path.exists(self.output_file):
original_output_file = self.output_file
# For partitioned namespaces, create a temp directory; for single files, create a temp file path
temp_output_file = self.output_file + ".resume_temp"
# Remove temp file/dir if it exists from a previous failed run
if os.path.exists(temp_output_file):
import shutil
if os.path.isdir(temp_output_file):
shutil.rmtree(temp_output_file)
else:
os.remove(temp_output_file)
# For partitioned namespaces, create the directory now; for single files it will be created by ParquetWriter
if self.partition_namespaces:
os.makedirs(temp_output_file, exist_ok=True)
original_partition_dir = None
if self.resume_point is not None and self.output_parquet:
original_output_file, temp_output_file, original_partition_dir = \
setup_resume_temp_output(self.output_file, self.partition_namespaces)
if temp_output_file is not None:
self.output_file = temp_output_file
# Construct dump file iterator
@@ -485,6 +512,8 @@ class WikiqParser:
flavor="spark",
sorting_columns=sorting_cols,
)
ns_paths = {}
pq_writers = {}
else:
output_path = Path(self.output_file)
if self.namespace_filter is not None:
@@ -495,14 +524,9 @@ class WikiqParser:
ns: (output_path.parent / f"namespace={ns}") / output_path.name
for ns in namespaces
}
for path in ns_paths.values():
Path(path).parent.mkdir(exist_ok=True, parents=True)
pq_writers = {
ns: pq.ParquetWriter(
path, schema, flavor="spark", sorting_columns=sorting_cols
)
for ns, path in ns_paths.items()
}
# Writers are created lazily when first needed to avoid empty files on early exit
pq_writers = {}
writer = None # Not used for partitioned output
else:
writer = pacsv.CSVWriter(
@@ -510,6 +534,9 @@ class WikiqParser:
schema,
write_options=pacsv.WriteOptions(delimiter="\t"),
)
ns_paths = {}
pq_writers = {}
sorting_cols = None
regex_matches = {}
@@ -522,6 +549,42 @@ class WikiqParser:
if page.mwpage.namespace not in self.namespace_filter:
continue
# Resume logic: skip pages that come before the resume point.
# For partitioned output, each namespace has its own resume point.
is_resume_page = False
page_resume_point = None
if self.resume_point is not None:
page_id = page.mwpage.id
page_ns = page.mwpage.namespace
if self.partition_namespaces:
# Per-namespace resume: check if we've passed this namespace's resume point
if found_resume_point.get(page_ns, False):
pass # Already past resume point for this namespace
elif page_ns not in self.resume_point:
# No resume point for this namespace, process normally
found_resume_point[page_ns] = True
else:
resume_pageid, resume_revid = self.resume_point[page_ns]
if page_id < resume_pageid:
continue
elif page_id == resume_pageid:
is_resume_page = True
page_resume_point = (resume_pageid, resume_revid)
else:
found_resume_point[page_ns] = True
else:
# Single-file resume: global resume point
if not found_resume_point:
resume_pageid, resume_revid = self.resume_point
if page_id < resume_pageid:
continue
elif page_id == resume_pageid:
is_resume_page = True
page_resume_point = (resume_pageid, resume_revid)
else:
found_resume_point = True
# Disable detecting reverts if radius is 0.
if self.revert_radius > 0:
reverts_column.rev_detector = mwreverts.Detector(
@@ -602,28 +665,6 @@ class WikiqParser:
n_revs = 0
# If we're resuming and haven't found the resume point yet, check this batch
skip_batch = False
if not found_resume_point and self.resume_from_revid is not None:
batch_has_resume_point = False
for revs in batch:
revs_list = list(revs)
for rev in revs_list:
if rev.id == self.resume_from_revid:
batch_has_resume_point = True
found_resume_point = True
print(f"Found resume point at revid {self.resume_from_revid}", file=sys.stderr)
break
if batch_has_resume_point:
break
# If this batch doesn't contain the resume point, skip it entirely
if not batch_has_resume_point:
skip_batch = True
if skip_batch:
continue
for revs in batch:
# Revisions may or may not be grouped into lists of contiguous revisions by the
# same user. We call these "edit sessions". Otherwise revs is a list containing
@@ -650,6 +691,15 @@ class WikiqParser:
regex_matches[k] = []
regex_matches[k].append(v)
# Check for shutdown after each revision
if self.shutdown_requested:
break
# If shutdown requested, skip all remaining processing and close writers
if self.shutdown_requested:
print("Shutdown requested, closing writers...", file=sys.stderr)
break
# Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
batch_row_buffer = table.pop()
if self.persist != PersistMethod.none:
@@ -790,31 +840,39 @@ class WikiqParser:
if not self.text and self.persist != PersistMethod.none:
del row_buffer["text"]
# If we just found the resume point in this batch, filter to only write revisions after it
if self.resume_from_revid is not None:
# Filter for resume logic if on resume page
should_write = True
if is_resume_page:
_, resume_revid = page_resume_point
revids = row_buffer["revid"]
# Find the index of the resume revid
resume_idx = None
for idx, revid in enumerate(revids):
if revid == self.resume_from_revid:
resume_idx = idx
break
resume_idx = next((i for i, r in enumerate(revids) if r == resume_revid), None)
if resume_idx is not None:
# Mark resume point as found
if self.partition_namespaces:
found_resume_point[page.mwpage.namespace] = True
else:
found_resume_point = True
is_resume_page = False
# Only write revisions after the resume point
if resume_idx + 1 < len(revids):
row_buffer = {k: v[resume_idx + 1:] for k, v in row_buffer.items()}
print(f"Resuming output starting at revid {row_buffer['revid'][0]}", file=sys.stderr)
else:
# The resume point was the last revision in this batch, skip writing
continue
should_write = False
else:
should_write = False
# Only write if there are rows to write
if len(row_buffer.get("revid", [])) > 0:
if self.partition_namespaces is True:
writer = pq_writers[page.mwpage.namespace]
writer.write(pa.record_batch(row_buffer, schema=schema))
# Write batch if there are rows
if should_write and len(row_buffer.get("revid", [])) > 0:
namespace = page.mwpage.namespace if self.partition_namespaces else None
self._write_batch(row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace)
gc.collect()
# If shutdown was requested, break from page loop
if self.shutdown_requested:
break
page_count += 1
print(
@@ -829,79 +887,12 @@ class WikiqParser:
# If we were resuming, merge the original file with the new temp file
if original_output_file is not None and temp_output_file is not None:
print("Merging resumed data with existing output...", file=sys.stderr)
try:
# Check if we're merging partitioned namespaces or single files
if os.path.isdir(original_output_file):
# Merge partitioned namespace directories
self._merge_partitioned_namespaces(original_output_file, temp_output_file)
else:
# Merge single parquet files
merged_output_file = original_output_file + ".merged"
merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
# Replace the original file with the merged file
os.remove(original_output_file)
os.rename(merged_output_file, original_output_file)
# Clean up the temp file/directory
if os.path.exists(temp_output_file):
if os.path.isdir(temp_output_file):
import shutil
shutil.rmtree(temp_output_file)
else:
os.remove(temp_output_file)
print("Merge complete.", file=sys.stderr)
except Exception as e:
print(f"Error merging resume data: {e}", file=sys.stderr)
print(f"New data saved in: {temp_output_file}", file=sys.stderr)
raise
def _merge_partitioned_namespaces(self, original_output_dir, temp_output_dir):
"""
Merge partitioned namespace directories.
For each namespace partition in the temp directory, merge its parquet files with the original.
"""
import shutil
# Get all namespace directories from temp
temp_namespace_dirs = [d for d in os.listdir(temp_output_dir) if d.startswith('namespace=')]
for ns_dir in temp_namespace_dirs:
temp_ns_path = os.path.join(temp_output_dir, ns_dir)
original_ns_path = os.path.join(original_output_dir, ns_dir)
# Find parquet files in the temp namespace directory
temp_parquet_files = [f for f in os.listdir(temp_ns_path) if f.endswith('.parquet')]
if not temp_parquet_files:
continue
temp_parquet_path = os.path.join(temp_ns_path, temp_parquet_files[0])
# Check if the namespace partition exists in the original directory
if os.path.exists(original_ns_path):
# Namespace partition exists, merge the files
original_parquet_files = [f for f in os.listdir(original_ns_path) if f.endswith('.parquet')]
if not original_parquet_files:
# No parquet file in original, just copy the temp file
shutil.copy(temp_parquet_path, os.path.join(original_ns_path, temp_parquet_files[0]))
else:
original_parquet_path = os.path.join(original_ns_path, original_parquet_files[0])
merged_parquet_path = original_parquet_path + ".merged"
# Merge the files
merge_parquet_files(original_parquet_path, temp_parquet_path, merged_parquet_path)
# Replace the original file with the merged file
os.remove(original_parquet_path)
os.rename(merged_parquet_path, original_parquet_path)
else:
# Namespace partition doesn't exist in original, create it
shutil.copytree(temp_ns_path, original_ns_path)
finalize_resume_merge(
original_output_file,
temp_output_file,
self.partition_namespaces,
original_partition_dir
)
def match_archive_suffix(input_filename):
if re.match(r".*\.7z$", input_filename):
@@ -942,111 +933,6 @@ def open_output_file(input_filename):
return output_file
def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by appending all row groups from *original_path*
    and then *temp_path* into a new file at *merged_path*.

    The writer is created lazily from the first row group encountered in
    either source.  This fixes a crash in the previous version: when the
    original file contained zero row groups, ``merged_writer`` stayed None
    and writing the temp file's row groups raised AttributeError.  Row
    groups are streamed one at a time, so peak memory stays proportional to
    a single row group.  The writer is always closed, even if a read fails.
    """
    merged_writer = None
    try:
        for source_path in (original_path, temp_path):
            source = pq.ParquetFile(source_path)
            for i in range(source.num_row_groups):
                row_group = source.read_row_group(i)
                if merged_writer is None:
                    # Lazy creation: take the schema from whichever source
                    # yields data first.
                    merged_writer = pq.ParquetWriter(
                        merged_path,
                        row_group.schema,
                        flavor="spark",
                    )
                merged_writer.write_table(row_group)
    finally:
        if merged_writer is not None:
            merged_writer.close()
def get_last_revid_from_parquet(output_file):
    """
    Read the last revid from a parquet file or partitioned namespace directory.

    Handles both a single parquet file and a partitioned layout
    (``namespace=*/file.parquet``); for partitioned output the most
    recently modified namespace partition is consulted, since that is the
    one written to last.  Returns None if the output does not exist, is
    empty, or cannot be read.  Errors are reported to stderr rather than
    raised because resume is best-effort: a failed probe just means we
    start from the beginning.
    """
    try:
        if not os.path.exists(output_file):
            return None
        if os.path.isdir(output_file):
            parquet_path = _most_recent_partition_parquet(output_file)
            if parquet_path is None:
                return None
            return _last_revid_in_file(parquet_path)
        return _last_revid_in_file(output_file)
    except Exception as e:
        print(f"Error reading last revid from {output_file}: {e}", file=sys.stderr)
        return None


def _most_recent_partition_parquet(output_dir):
    """Return the path of a parquet file inside the most recently modified
    ``namespace=*`` partition of *output_dir*, or None if none exists."""
    namespace_dirs = [d for d in os.listdir(output_dir) if d.startswith('namespace=')]
    if not namespace_dirs:
        return None
    # The partition with the newest mtime is the one written to last.
    ns_paths = [os.path.join(output_dir, d) for d in namespace_dirs]
    most_recent_ns = max(ns_paths, key=os.path.getmtime)
    parquet_files = [f for f in os.listdir(most_recent_ns) if f.endswith('.parquet')]
    if not parquet_files:
        return None
    return os.path.join(most_recent_ns, parquet_files[0])


def _last_revid_in_file(parquet_path):
    """Return the last revid stored in *parquet_path*, or None if empty.

    Reads only the final row group and only the 'revid' column, so the
    probe stays cheap even for very large files.
    """
    parquet_file = pq.ParquetFile(parquet_path)
    if parquet_file.num_row_groups == 0:
        return None
    last_row_group = parquet_file.read_row_group(
        parquet_file.num_row_groups - 1, columns=['revid']
    )
    if last_row_group.num_rows == 0:
        return None
    return last_row_group.column('revid')[-1].as_py()
def main():
parser = argparse.ArgumentParser(
description="Parse MediaWiki XML database dumps into tab delimited data."
@@ -1291,16 +1177,29 @@ def main():
output_file = output
# Handle resume functionality
resume_from_revid = None
resume_point = None
if args.resume:
if output_parquet and not args.stdout:
resume_from_revid = get_last_revid_from_parquet(output_file)
if resume_from_revid is not None:
print(f"Resuming from last written revid: {resume_from_revid}", file=sys.stderr)
resume_point = get_resume_point(output_file, args.partition_namespaces)
if resume_point is not None:
if args.partition_namespaces:
# Dict mapping namespace -> (pageid, revid)
ns_list = sorted(resume_point.keys())
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
for ns in ns_list:
pageid, revid = resume_point[ns]
print(f" namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
else:
pageid, revid = resume_point
print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
else:
print("Resume requested but no existing output file found, starting from beginning", file=sys.stderr)
if args.partition_namespaces:
partition_dir = os.path.dirname(output_file)
sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
else:
sys.exit(f"Error: --resume specified but output file not found: {output_file}")
else:
print("Warning: --resume only works with parquet output (not stdout or TSV)", file=sys.stderr)
sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")
wikiq = WikiqParser(
input_file,
@@ -1318,7 +1217,7 @@ def main():
output_parquet=output_parquet,
partition_namespaces=args.partition_namespaces,
batch_size=args.batch_size,
resume_from_revid=resume_from_revid,
resume_point=resume_point,
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
@@ -1326,7 +1225,23 @@ def main():
headings=args.headings,
)
wikiq.process()
# Register signal handlers for graceful shutdown (CLI only)
def handle_shutdown(signum, frame):
sig_name = signal.Signals(signum).name
print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
wikiq.request_shutdown()
original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
try:
wikiq.process()
finally:
# Restore original signal handlers
signal.signal(signal.SIGTERM, original_sigterm)
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGUSR1, original_sigusr1)
# close things
input_file.close()
@@ -1350,7 +1265,7 @@ def main():
diff=args.diff,
text=args.text,
batch_size=args.batch_size,
resume_from_revid=None,
resume_point=None,
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
@@ -1358,7 +1273,23 @@ def main():
headings=args.headings,
)
wikiq.process()
# Register signal handlers for graceful shutdown (CLI only)
def handle_shutdown(signum, frame):
sig_name = signal.Signals(signum).name
print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
wikiq.request_shutdown()
original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
try:
wikiq.process()
finally:
# Restore original signal handlers
signal.signal(signal.SIGTERM, original_sigterm)
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGUSR1, original_sigusr1)
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")