Add per-namespace resume support for partitioned parquet output.
- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid)) to correctly handle interleaved dump ordering in partitioned output
- Extract resume functionality to dedicated resume.py module
- Add graceful shutdown handling via shutdown_requested flag (CLI-level only)
- Use lazy ParquetWriter creation to avoid empty files on early exit
- Refactor writing logic to _write_batch() helper method
- Simplify control flow by replacing continue statements with should_write flag
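Illustrative sketch of the per-namespace mechanism (simplified, not code from
this commit; the names below are invented for the example): each namespace
keeps its own (pageid, revid) high-water mark, so pages from different
namespaces pass their resume points independently even when the dump
interleaves them.

    # namespace -> (pageid, revid): each partition resumes independently
    resume_point = {0: (1234, 567890), 1: (88, 4321)}
    found_resume_point = {}  # namespace -> bool, True once past the mark

    def page_is_before_resume_point(ns, page_id):
        """Return True if this page was fully written in a previous run."""
        if found_resume_point.get(ns, False):
            return False  # already past this namespace's resume point
        if ns not in resume_point:
            found_resume_point[ns] = True  # namespace had no prior output
            return False
        resume_pageid, _ = resume_point[ns]
        if page_id < resume_pageid:
            return True  # entirely before the resume point: skip
        if page_id > resume_pageid:
            found_resume_point[ns] = True  # passed the mark: process normally
        return False  # the resume page itself is filtered per revision

    assert page_is_before_resume_point(0, 100)       # precedes pageid 1234
    assert not page_is_before_resume_point(1, 9000)  # past namespace 1's mark

Single-file output keeps one global (pageid, revid) tuple as before, and
graceful shutdown only sets shutdown_requested so the write loop can finish
its current batch before writers are closed and the temp output is merged.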
@@ -8,6 +8,7 @@ import gc
 import json
 import os.path
 import re
+import signal
 import sys
 from collections import deque
 from hashlib import sha1
@@ -28,6 +29,11 @@ import wikiq.tables as tables
 from wikiq.tables import RevisionTable
 from wikiq.wiki_diff_matcher import WikiDiffMatcher
 from wikiq.wikitext_parser import WikitextParser
+from wikiq.resume import (
+    get_resume_point,
+    setup_resume_temp_output,
+    finalize_resume_merge,
+)
 
 TO_ENCODE = ("title", "editor")
 PERSISTENCE_RADIUS = 7
@@ -244,7 +250,7 @@ class WikiqParser:
         output_parquet: bool = True,
         batch_size: int = 1024,
         partition_namespaces: bool = False,
-        resume_from_revid: int = None,
+        resume_point: Union[tuple, dict, None] = None,
         external_links: bool = False,
         citations: bool = False,
         wikilinks: bool = False,
@@ -254,7 +260,10 @@ class WikiqParser:
         """
         Parameters:
            persist : what persistence method to use. Takes a PersistMethod value
-           resume_from_revid : if set, skip all revisions up to and including this revid
+           resume_point : if set, either a (pageid, revid) tuple for single-file output,
+                          or a dict mapping namespace -> (pageid, revid) for partitioned output.
+                          For single-file: skip all revisions up to
+                          and including this point
         """
         self.input_file = input_file
 
@@ -265,12 +274,13 @@ class WikiqParser:
         self.diff = diff
         self.text = text
         self.partition_namespaces = partition_namespaces
-        self.resume_from_revid = resume_from_revid
+        self.resume_point = resume_point
         self.external_links = external_links
         self.citations = citations
         self.wikilinks = wikilinks
         self.templates = templates
         self.headings = headings
+        self.shutdown_requested = False
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
         else:
@@ -299,6 +309,27 @@ class WikiqParser:
         else:
             self.output_file = open(output_file, "wb")
 
+    def request_shutdown(self):
+        """Request graceful shutdown. The process() method will exit after completing the current batch."""
+        self.shutdown_requested = True
+
+    def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace=None):
+        """Write a batch of rows to the appropriate writer.
+
+        For partitioned output, creates writer lazily if needed.
+        Returns the writer used (for non-partitioned output, same as input).
+        """
+        if self.partition_namespaces and namespace is not None:
+            if namespace not in pq_writers:
+                ns_path = ns_paths[namespace]
+                Path(ns_path).parent.mkdir(exist_ok=True, parents=True)
+                pq_writers[namespace] = pq.ParquetWriter(
+                    ns_path, schema, flavor="spark", sorting_columns=sorting_cols
+                )
+            writer = pq_writers[namespace]
+        writer.write(pa.record_batch(row_buffer, schema=schema))
+        return writer
+
     def make_matchmake_pairs(self, patterns, labels) -> list[RegexPair]:
         if (patterns is not None and labels is not None) and (
             len(patterns) == len(labels)
@@ -358,26 +389,22 @@ class WikiqParser:
        # input_filename)
 
        # Track whether we've passed the resume point
-       found_resume_point = self.resume_from_revid is None
+       # For partitioned output, this is a dict mapping namespace -> bool
+       if self.resume_point is None:
+           found_resume_point = True
+       elif self.partition_namespaces:
+           found_resume_point = {}
+       else:
+           found_resume_point = False
 
        # When resuming with parquet, write new data to temp file/directory and merge at the end
        original_output_file = None
        temp_output_file = None
-       if self.resume_from_revid is not None and self.output_parquet:
-           if isinstance(self.output_file, str) and os.path.exists(self.output_file):
-               original_output_file = self.output_file
-               # For partitioned namespaces, create a temp directory; for single files, create a temp file path
-               temp_output_file = self.output_file + ".resume_temp"
-               # Remove temp file/dir if it exists from a previous failed run
-               if os.path.exists(temp_output_file):
-                   import shutil
-                   if os.path.isdir(temp_output_file):
-                       shutil.rmtree(temp_output_file)
-                   else:
-                       os.remove(temp_output_file)
-               # For partitioned namespaces, create the directory now; for single files it will be created by ParquetWriter
-               if self.partition_namespaces:
-                   os.makedirs(temp_output_file, exist_ok=True)
+       original_partition_dir = None
+       if self.resume_point is not None and self.output_parquet:
+           original_output_file, temp_output_file, original_partition_dir = \
+               setup_resume_temp_output(self.output_file, self.partition_namespaces)
+           if temp_output_file is not None:
+               self.output_file = temp_output_file
 
        # Construct dump file iterator
@@ -485,6 +512,8 @@ class WikiqParser:
                    flavor="spark",
                    sorting_columns=sorting_cols,
                )
+                ns_paths = {}
+                pq_writers = {}
            else:
                output_path = Path(self.output_file)
                if self.namespace_filter is not None:
@@ -495,14 +524,9 @@ class WikiqParser:
                    ns: (output_path.parent / f"namespace={ns}") / output_path.name
                    for ns in namespaces
                }
-                for path in ns_paths.values():
-                    Path(path).parent.mkdir(exist_ok=True, parents=True)
-                pq_writers = {
-                    ns: pq.ParquetWriter(
-                        path, schema, flavor="spark", sorting_columns=sorting_cols
-                    )
-                    for ns, path in ns_paths.items()
-                }
+                # Writers are created lazily when first needed to avoid empty files on early exit
+                pq_writers = {}
+                writer = None  # Not used for partitioned output
 
        else:
            writer = pacsv.CSVWriter(
@@ -510,6 +534,9 @@ class WikiqParser:
                schema,
                write_options=pacsv.WriteOptions(delimiter="\t"),
            )
+            ns_paths = {}
+            pq_writers = {}
+            sorting_cols = None
 
        regex_matches = {}
 
@@ -522,6 +549,42 @@ class WikiqParser:
                if page.mwpage.namespace not in self.namespace_filter:
                    continue
 
+            # Resume logic: skip pages that come before the resume point.
+            # For partitioned output, each namespace has its own resume point.
+            is_resume_page = False
+            page_resume_point = None
+            if self.resume_point is not None:
+                page_id = page.mwpage.id
+                page_ns = page.mwpage.namespace
+
+                if self.partition_namespaces:
+                    # Per-namespace resume: check if we've passed this namespace's resume point
+                    if found_resume_point.get(page_ns, False):
+                        pass  # Already past resume point for this namespace
+                    elif page_ns not in self.resume_point:
+                        # No resume point for this namespace, process normally
+                        found_resume_point[page_ns] = True
+                    else:
+                        resume_pageid, resume_revid = self.resume_point[page_ns]
+                        if page_id < resume_pageid:
+                            continue
+                        elif page_id == resume_pageid:
+                            is_resume_page = True
+                            page_resume_point = (resume_pageid, resume_revid)
+                        else:
+                            found_resume_point[page_ns] = True
+                else:
+                    # Single-file resume: global resume point
+                    if not found_resume_point:
+                        resume_pageid, resume_revid = self.resume_point
+                        if page_id < resume_pageid:
+                            continue
+                        elif page_id == resume_pageid:
+                            is_resume_page = True
+                            page_resume_point = (resume_pageid, resume_revid)
+                        else:
+                            found_resume_point = True
+
            # Disable detecting reverts if radius is 0.
            if self.revert_radius > 0:
                reverts_column.rev_detector = mwreverts.Detector(
@@ -602,28 +665,6 @@ class WikiqParser:
 
                n_revs = 0
 
-                # If we're resuming and haven't found the resume point yet, check this batch
-                skip_batch = False
-                if not found_resume_point and self.resume_from_revid is not None:
-                    batch_has_resume_point = False
-                    for revs in batch:
-                        revs_list = list(revs)
-                        for rev in revs_list:
-                            if rev.id == self.resume_from_revid:
-                                batch_has_resume_point = True
-                                found_resume_point = True
-                                print(f"Found resume point at revid {self.resume_from_revid}", file=sys.stderr)
-                                break
-                        if batch_has_resume_point:
-                            break
-
-                    # If this batch doesn't contain the resume point, skip it entirely
-                    if not batch_has_resume_point:
-                        skip_batch = True
-
-                if skip_batch:
-                    continue
-
                for revs in batch:
                    # Revisions may or may not be grouped into lists of contiguous revisions by the
                    # same user. We call these "edit sessions". Otherwise revs is a list containing
@@ -650,6 +691,15 @@ class WikiqParser:
                                regex_matches[k] = []
                            regex_matches[k].append(v)
 
+                    # Check for shutdown after each revision
+                    if self.shutdown_requested:
+                        break
+
+                # If shutdown requested, skip all remaining processing and close writers
+                if self.shutdown_requested:
+                    print("Shutdown requested, closing writers...", file=sys.stderr)
+                    break
+
                # Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
                batch_row_buffer = table.pop()
                if self.persist != PersistMethod.none:
@@ -790,31 +840,39 @@ class WikiqParser:
            if not self.text and self.persist != PersistMethod.none:
                del row_buffer["text"]
 
-            # If we just found the resume point in this batch, filter to only write revisions after it
-            if self.resume_from_revid is not None:
+            # Filter for resume logic if on resume page
+            should_write = True
+            if is_resume_page:
+                _, resume_revid = page_resume_point
                revids = row_buffer["revid"]
                # Find the index of the resume revid
-                resume_idx = None
-                for idx, revid in enumerate(revids):
-                    if revid == self.resume_from_revid:
-                        resume_idx = idx
-                        break
+                resume_idx = next((i for i, r in enumerate(revids) if r == resume_revid), None)
 
                if resume_idx is not None:
+                    # Mark resume point as found
+                    if self.partition_namespaces:
+                        found_resume_point[page.mwpage.namespace] = True
+                    else:
+                        found_resume_point = True
+                    is_resume_page = False
 
                    # Only write revisions after the resume point
                    if resume_idx + 1 < len(revids):
                        row_buffer = {k: v[resume_idx + 1:] for k, v in row_buffer.items()}
                        print(f"Resuming output starting at revid {row_buffer['revid'][0]}", file=sys.stderr)
                    else:
                        # The resume point was the last revision in this batch, skip writing
-                        continue
+                        should_write = False
+                else:
+                    should_write = False
 
-            # Only write if there are rows to write
-            if len(row_buffer.get("revid", [])) > 0:
-                if self.partition_namespaces is True:
-                    writer = pq_writers[page.mwpage.namespace]
-                writer.write(pa.record_batch(row_buffer, schema=schema))
+            # Write batch if there are rows
+            if should_write and len(row_buffer.get("revid", [])) > 0:
+                namespace = page.mwpage.namespace if self.partition_namespaces else None
+                self._write_batch(row_buffer, schema, writer, pq_writers, ns_paths, sorting_cols, namespace)
            gc.collect()
 
+            # If shutdown was requested, break from page loop
+            if self.shutdown_requested:
+                break
            page_count += 1
 
            print(
@@ -829,79 +887,12 @@ class WikiqParser:
 
        # If we were resuming, merge the original file with the new temp file
        if original_output_file is not None and temp_output_file is not None:
-            print("Merging resumed data with existing output...", file=sys.stderr)
-            try:
-                # Check if we're merging partitioned namespaces or single files
-                if os.path.isdir(original_output_file):
-                    # Merge partitioned namespace directories
-                    self._merge_partitioned_namespaces(original_output_file, temp_output_file)
-                else:
-                    # Merge single parquet files
-                    merged_output_file = original_output_file + ".merged"
-                    merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
-
-                    # Replace the original file with the merged file
-                    os.remove(original_output_file)
-                    os.rename(merged_output_file, original_output_file)
-
-                # Clean up the temp file/directory
-                if os.path.exists(temp_output_file):
-                    if os.path.isdir(temp_output_file):
-                        import shutil
-                        shutil.rmtree(temp_output_file)
-                    else:
-                        os.remove(temp_output_file)
-
-                print("Merge complete.", file=sys.stderr)
-            except Exception as e:
-                print(f"Error merging resume data: {e}", file=sys.stderr)
-                print(f"New data saved in: {temp_output_file}", file=sys.stderr)
-                raise
-
-    def _merge_partitioned_namespaces(self, original_output_dir, temp_output_dir):
-        """
-        Merge partitioned namespace directories.
-        For each namespace partition in the temp directory, merge its parquet files with the original.
-        """
-        import shutil
-
-        # Get all namespace directories from temp
-        temp_namespace_dirs = [d for d in os.listdir(temp_output_dir) if d.startswith('namespace=')]
-
-        for ns_dir in temp_namespace_dirs:
-            temp_ns_path = os.path.join(temp_output_dir, ns_dir)
-            original_ns_path = os.path.join(original_output_dir, ns_dir)
-
-            # Find parquet files in the temp namespace directory
-            temp_parquet_files = [f for f in os.listdir(temp_ns_path) if f.endswith('.parquet')]
-
-            if not temp_parquet_files:
-                continue
-
-            temp_parquet_path = os.path.join(temp_ns_path, temp_parquet_files[0])
-
-            # Check if the namespace partition exists in the original directory
-            if os.path.exists(original_ns_path):
-                # Namespace partition exists, merge the files
-                original_parquet_files = [f for f in os.listdir(original_ns_path) if f.endswith('.parquet')]
-
-                if not original_parquet_files:
-                    # No parquet file in original, just copy the temp file
-                    shutil.copy(temp_parquet_path, os.path.join(original_ns_path, temp_parquet_files[0]))
-                else:
-                    original_parquet_path = os.path.join(original_ns_path, original_parquet_files[0])
-                    merged_parquet_path = original_parquet_path + ".merged"
-
-                    # Merge the files
-                    merge_parquet_files(original_parquet_path, temp_parquet_path, merged_parquet_path)
-
-                    # Replace the original file with the merged file
-                    os.remove(original_parquet_path)
-                    os.rename(merged_parquet_path, original_parquet_path)
-            else:
-                # Namespace partition doesn't exist in original, create it
-                shutil.copytree(temp_ns_path, original_ns_path)
+            finalize_resume_merge(
+                original_output_file,
+                temp_output_file,
+                self.partition_namespaces,
+                original_partition_dir
+            )
 
 def match_archive_suffix(input_filename):
    if re.match(r".*\.7z$", input_filename):
@@ -942,111 +933,6 @@ def open_output_file(input_filename):
    return output_file
 
 
-def merge_parquet_files(original_path, temp_path, merged_path):
-    """
-    Merge two parquet files by copying all row groups from original and temp into merged.
-    """
-    original_pq = pq.ParquetFile(original_path)
-    temp_pq = pq.ParquetFile(temp_path)
-
-    merged_writer = None
-
-    # Copy all row groups from the original file
-    for i in range(original_pq.num_row_groups):
-        row_group = original_pq.read_row_group(i)
-        if merged_writer is None:
-            merged_writer = pq.ParquetWriter(
-                merged_path,
-                row_group.schema,
-                flavor="spark"
-            )
-        merged_writer.write_table(row_group)
-
-    # Append all row groups from the temp file
-    for i in range(temp_pq.num_row_groups):
-        row_group = temp_pq.read_row_group(i)
-        merged_writer.write_table(row_group)
-
-    # Close the writer
-    if merged_writer is not None:
-        merged_writer.close()
-
-
-def get_last_revid_from_parquet(output_file):
-    """
-    Read the last revid from a parquet file or partitioned namespace directory.
-    Returns None if the file doesn't exist or is empty.
-    Handles both single files and partitioned namespace structures (namespace=*/file.parquet).
-    For partitioned namespaces, finds the most recently modified partition and reads from it.
-    """
-    try:
-        if not os.path.exists(output_file):
-            return None
-
-        # Check if this is a partitioned namespace directory
-        if os.path.isdir(output_file):
-            # Find all namespace=* subdirectories
-            namespace_dirs = [d for d in os.listdir(output_file) if d.startswith('namespace=')]
-
-            if not namespace_dirs:
-                return None
-
-            # Find the most recently modified namespace partition
-            most_recent_ns = None
-            most_recent_mtime = -1
-
-            for ns_dir in namespace_dirs:
-                ns_path = os.path.join(output_file, ns_dir)
-                mtime = os.path.getmtime(ns_path)
-                if mtime > most_recent_mtime:
-                    most_recent_mtime = mtime
-                    most_recent_ns = ns_path
-
-            if most_recent_ns is None:
-                return None
-
-            # Find the parquet file in the most recent namespace directory
-            parquet_files = [f for f in os.listdir(most_recent_ns) if f.endswith('.parquet')]
-
-            if not parquet_files:
-                return None
-
-            parquet_path = os.path.join(most_recent_ns, parquet_files[0])
-            parquet_file = pq.ParquetFile(parquet_path)
-
-            if parquet_file.num_row_groups == 0:
-                return None
-
-            # Read only the last row group, and only the revid column
-            last_row_group = parquet_file.read_row_group(parquet_file.num_row_groups - 1, columns=['revid'])
-
-            if last_row_group.num_rows == 0:
-                return None
-
-            # Get the last revid from this row group
-            last_revid = last_row_group.column('revid')[-1].as_py()
-            return last_revid
-        else:
-            # Single parquet file
-            parquet_file = pq.ParquetFile(output_file)
-
-            if parquet_file.num_row_groups == 0:
-                return None
-
-            # Read only the last row group, and only the revid column
-            last_row_group = parquet_file.read_row_group(parquet_file.num_row_groups - 1, columns=['revid'])
-
-            if last_row_group.num_rows == 0:
-                return None
-
-            # Get the last revid from this row group
-            last_revid = last_row_group.column('revid')[-1].as_py()
-            return last_revid
-    except Exception as e:
-        print(f"Error reading last revid from {output_file}: {e}", file=sys.stderr)
-        return None
-
-
 def main():
    parser = argparse.ArgumentParser(
        description="Parse MediaWiki XML database dumps into tab delimited data."
@@ -1291,16 +1177,29 @@ def main():
        output_file = output
 
    # Handle resume functionality
-    resume_from_revid = None
+    resume_point = None
    if args.resume:
        if output_parquet and not args.stdout:
-            resume_from_revid = get_last_revid_from_parquet(output_file)
-            if resume_from_revid is not None:
-                print(f"Resuming from last written revid: {resume_from_revid}", file=sys.stderr)
+            resume_point = get_resume_point(output_file, args.partition_namespaces)
+            if resume_point is not None:
+                if args.partition_namespaces:
+                    # Dict mapping namespace -> (pageid, revid)
+                    ns_list = sorted(resume_point.keys())
+                    print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
+                    for ns in ns_list:
+                        pageid, revid = resume_point[ns]
+                        print(f"  namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
+                else:
+                    pageid, revid = resume_point
+                    print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
            else:
-                print("Resume requested but no existing output file found, starting from beginning", file=sys.stderr)
+                if args.partition_namespaces:
+                    partition_dir = os.path.dirname(output_file)
+                    sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
+                else:
+                    sys.exit(f"Error: --resume specified but output file not found: {output_file}")
        else:
-            print("Warning: --resume only works with parquet output (not stdout or TSV)", file=sys.stderr)
+            sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")
 
        wikiq = WikiqParser(
            input_file,
@@ -1318,7 +1217,7 @@ def main():
            output_parquet=output_parquet,
            partition_namespaces=args.partition_namespaces,
            batch_size=args.batch_size,
-            resume_from_revid=resume_from_revid,
+            resume_point=resume_point,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
@@ -1326,7 +1225,23 @@ def main():
            headings=args.headings,
        )
 
-        wikiq.process()
+        # Register signal handlers for graceful shutdown (CLI only)
+        def handle_shutdown(signum, frame):
+            sig_name = signal.Signals(signum).name
+            print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
+            wikiq.request_shutdown()
+
+        original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
+        original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
+        original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
+
+        try:
+            wikiq.process()
+        finally:
+            # Restore original signal handlers
+            signal.signal(signal.SIGTERM, original_sigterm)
+            signal.signal(signal.SIGINT, original_sigint)
+            signal.signal(signal.SIGUSR1, original_sigusr1)
 
        # close things
        input_file.close()
@@ -1350,7 +1265,7 @@ def main():
            diff=args.diff,
            text=args.text,
            batch_size=args.batch_size,
-            resume_from_revid=None,
+            resume_point=None,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
@@ -1358,7 +1273,23 @@ def main():
            headings=args.headings,
        )
 
-        wikiq.process()
+        # Register signal handlers for graceful shutdown (CLI only)
+        def handle_shutdown(signum, frame):
+            sig_name = signal.Signals(signum).name
+            print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
+            wikiq.request_shutdown()
+
+        original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
+        original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
+        original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
+
+        try:
+            wikiq.process()
+        finally:
+            # Restore original signal handlers
+            signal.signal(signal.SIGTERM, original_sigterm)
+            signal.signal(signal.SIGINT, original_sigint)
+            signal.signal(signal.SIGUSR1, original_sigusr1)
 
 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
 # stop_words = stop_words.split(",")