Improve resume logic.

This commit is contained in:
Nathan TeBlunthuis
2025-12-07 06:06:26 -08:00
parent 577ddc87f5
commit 783f5fd8bc
2 changed files with 165 additions and 54 deletions

View File

@@ -33,6 +33,7 @@ from wikiq.resume import (
get_resume_point,
setup_resume_temp_output,
finalize_resume_merge,
get_checkpoint_path,
)
TO_ENCODE = ("title", "editor")
@@ -309,10 +310,49 @@ class WikiqParser:
else:
self.output_file = open(output_file, "wb")
# Checkpoint file for tracking resume point
self.checkpoint_file = None
self.checkpoint_state = {} # namespace -> (pageid, revid) or None -> (pageid, revid)
def request_shutdown(self):
    """Ask the parser to stop gracefully.

    Sets a flag that process() checks between batches; the batch
    currently being written is allowed to finish before exiting.
    """
    self.shutdown_requested = True
def _open_checkpoint(self, output_file):
    """Create (or truncate) the resume checkpoint file and keep it open.

    The handle is held open for the lifetime of the run so each batch
    update avoids reopen overhead. Checkpointing is skipped entirely
    when writing to stdout or non-parquet output.
    """
    # Checkpoints only apply to parquet files on disk.
    if output_file == sys.stdout.buffer or not self.output_parquet:
        return
    checkpoint_path = get_checkpoint_path(output_file)
    parent_dir = Path(checkpoint_path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    self.checkpoint_file = open(checkpoint_path, 'w')
    print(f"Checkpoint file opened: {checkpoint_path}", file=sys.stderr)
def _update_checkpoint(self, pageid, revid, namespace=None):
    """Record the most recently written (pageid, revid) position.

    The full state dict is rewritten in place on every call and
    flushed immediately, so the checkpoint survives an abrupt stop.
    With namespace partitioning, one position is tracked per
    namespace; otherwise a single position is kept.
    """
    # No-op when checkpointing is disabled (TSV/stdout output).
    if self.checkpoint_file is None:
        return
    position = {"pageid": pageid, "revid": revid}
    if self.partition_namespaces:
        self.checkpoint_state[namespace] = position
    else:
        self.checkpoint_state = position
    # Rewind and overwrite rather than reopening the file each batch.
    self.checkpoint_file.seek(0)
    self.checkpoint_file.truncate()
    json.dump(self.checkpoint_state, self.checkpoint_file)
    self.checkpoint_file.flush()
def _close_checkpoint(self, delete=False):
    """Close the checkpoint file, optionally removing it.

    Args:
        delete: when True (processing completed normally), remove the
            checkpoint file so a later run does not attempt to resume;
            when False (interrupted run), keep it on disk for resume.
    """
    if self.checkpoint_file is None:
        return
    checkpoint_path = self.checkpoint_file.name
    self.checkpoint_file.close()
    self.checkpoint_file = None
    if delete:
        # EAFP: remove directly instead of exists-then-remove (which is
        # racy). Previously, a requested deletion of an already-missing
        # file fell through to the "preserved for resume" message, which
        # was misleading; a missing file already satisfies deletion.
        try:
            os.remove(checkpoint_path)
        except FileNotFoundError:
            pass
        print(f"Checkpoint file deleted (processing complete): {checkpoint_path}", file=sys.stderr)
    else:
        print(f"Checkpoint file preserved for resume: {checkpoint_path}", file=sys.stderr)
def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace=None):
"""Write a batch of rows to the appropriate writer.
@@ -407,6 +447,11 @@ class WikiqParser:
if temp_output_file is not None:
self.output_file = temp_output_file
# Open checkpoint file for tracking resume point
# Use original_output_file if resuming, otherwise self.output_file
checkpoint_output = original_output_file if original_output_file else self.output_file
self._open_checkpoint(checkpoint_output)
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
@@ -868,6 +913,10 @@ class WikiqParser:
if should_write and len(row_buffer.get("revid", [])) > 0:
namespace = page.mwpage.namespace if self.partition_namespaces else None
self._write_batch(row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace)
# Update checkpoint with last written position
last_pageid = row_buffer["articleid"][-1]
last_revid = row_buffer["revid"][-1]
self._update_checkpoint(last_pageid, last_revid, namespace)
gc.collect()
# If shutdown was requested, break from page loop
@@ -894,6 +943,9 @@ class WikiqParser:
original_partition_dir
)
# Close checkpoint file; delete it only if we completed without interruption
self._close_checkpoint(delete=not self.shutdown_requested)
def match_archive_suffix(input_filename):
if re.match(r".*\.7z$", input_filename):
cmd = ["7za", "x", "-so", input_filename]
@@ -1155,9 +1207,7 @@ def main():
print(args, file=sys.stderr)
if len(args.dumpfiles) > 0:
for filename in args.dumpfiles:
input_file = open_input_file(filename, args.fandom_2020)
# open directory for output
# Determine output file path before opening input (so resume errors are caught early)
if args.output:
output = args.output[0]
else:
@@ -1165,25 +1215,21 @@ def main():
output_parquet = output.endswith(".parquet")
print("Processing file: %s" % filename, file=sys.stderr)
if args.stdout:
# Parquet libraries need a binary output, so just sys.stdout doesn't work.
output_file = sys.stdout.buffer
elif os.path.isdir(output) or output_parquet:
filename = os.path.join(output, os.path.basename(filename))
output_file = get_output_filename(filename, parquet=output_parquet)
output_filename = os.path.join(output, os.path.basename(filename))
output_file = get_output_filename(output_filename, parquet=output_parquet)
else:
output_file = output
# Handle resume functionality
# Handle resume functionality before opening input file
resume_point = None
if args.resume:
if output_parquet and not args.stdout:
resume_point = get_resume_point(output_file, args.partition_namespaces)
if resume_point is not None:
if args.partition_namespaces:
# Dict mapping namespace -> (pageid, revid)
ns_list = sorted(resume_point.keys())
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
for ns in ns_list:
@@ -1201,6 +1247,10 @@ def main():
else:
sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")
# Now open the input file
print("Processing file: %s" % filename, file=sys.stderr)
input_file = open_input_file(filename, args.fandom_2020)
wikiq = WikiqParser(
input_file,
output_file,