mediawiki_dump_tools/src/wikiq/__init__.py

#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import gc
import json
import os.path
import re
import signal
import sys
import threading
import time
from collections import deque, defaultdict
from hashlib import sha1
from io import TextIOWrapper
from itertools import groupby
from subprocess import PIPE, Popen
from typing import IO, Any, Generator, TextIO, Union

import mwpersistence
import mwreverts
import mwxml
import pywikidiff2
from deltas.tokenizers import wikitext_split
from more_itertools import peekable
from mwxml import Dump
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
from wikiq.resume import (
    get_checkpoint_path,
    read_checkpoint,
    get_resume_point,
    setup_resume_temp_output,
    finalize_resume_merge,
    cleanup_interrupted_resume,
)

TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
DIFF_TIMEOUT_MS = 60000
from pathlib import Path

import pyarrow as pa
import pyarrow.csv as pacsv
import pyarrow.parquet as pq
from deltas import SegmentMatcher, SequenceMatcher


def pyarrow_type_to_spark(pa_type):
    """Convert a PyArrow type to Spark JSON schema format."""
    if pa.types.is_int64(pa_type):
        return "long"
    elif pa.types.is_int32(pa_type):
        return "integer"
    elif pa.types.is_int8(pa_type):
        return "byte"
    elif pa.types.is_boolean(pa_type):
        return "boolean"
    elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
        return "string"
    elif pa.types.is_timestamp(pa_type):
        return "timestamp"
    elif pa.types.is_list(pa_type):
        return {
            "type": "array",
            "elementType": pyarrow_type_to_spark(pa_type.value_type),
            "containsNull": True
        }
    elif pa.types.is_struct(pa_type):
        return {
            "type": "struct",
            "fields": [
                {
                    "name": field.name,
                    "type": pyarrow_type_to_spark(field.type),
                    "nullable": field.nullable,
                    "metadata": {}
                }
                for field in pa_type
            ]
        }
    elif pa.types.is_map(pa_type):
        return {
            "type": "map",
            "keyType": pyarrow_type_to_spark(pa_type.key_type),
            "valueType": pyarrow_type_to_spark(pa_type.item_type),
            "valueContainsNull": True
        }
    else:
        return "string"


def pyarrow_to_spark_schema(schema: pa.Schema) -> dict:
    """Convert a PyArrow schema to Spark JSON schema format."""
    return {
        "type": "struct",
        "fields": [
            {
                "name": field.name,
                "type": pyarrow_type_to_spark(field.type),
                "nullable": field.nullable,
                "metadata": {}
            }
            for field in schema
        ]
    }


def build_table(
    text: bool = False,
    collapse_user: bool = False,
    external_links: bool = False,
    citations: bool = False,
    wikilinks: bool = False,
    templates: bool = False,
    headings: bool = False,
):
    """Build the RevisionTable with appropriate columns based on flags.

    Returns:
        (table, reverts_column) - the table and a reference to the reverts column
        (which process() needs for setting the revert detector).
    """
    reverts_column = tables.RevisionReverts()

    table = RevisionTable([
        tables.RevisionId(),
        tables.RevisionTimestamp(),
        tables.RevisionArticleId(),
        tables.RevisionPageTitle(),
        tables.RevisionNamespace(),
        tables.RevisionDeleted(),
        tables.RevisionEditorId(),
        tables.RevisionEditSummary(),
        tables.RevisionTextChars(),
        reverts_column,
        tables.RevisionSha1(),
        tables.RevisionIsMinor(),
        tables.RevisionEditorText(),
        tables.RevisionIsAnon(),
    ])

    if text:
        table.columns.append(tables.RevisionText())

    if collapse_user:
        table.columns.append(tables.RevisionCollapsed())

    if external_links or citations or wikilinks or templates or headings:
        wikitext_parser = WikitextParser()

        if external_links:
            table.columns.append(tables.RevisionExternalLinks(wikitext_parser))

        if citations:
            table.columns.append(tables.RevisionCitations(wikitext_parser))

        if wikilinks:
            table.columns.append(tables.RevisionWikilinks(wikitext_parser))

        if templates:
            table.columns.append(tables.RevisionTemplates(wikitext_parser))

        if headings:
            table.columns.append(tables.RevisionHeadings(wikitext_parser))

        table.columns.append(tables.RevisionParserTimeout(wikitext_parser))

    return table, reverts_column


def build_schema(
    table,
    diff: bool = False,
    persist: int = 0,
    text: bool = False,
    regex_revision_pairs: list = None,
    regex_comment_pairs: list = None,
) -> pa.Schema:
    """Build the PyArrow schema from a table, adding output-only fields."""
    schema = table.schema()
    schema = schema.append(pa.field("revert", pa.bool_(), nullable=True))

    if diff:
        from wikiq.diff_pyarrow_schema import diff_field
        schema = schema.append(diff_field)
        schema = schema.append(pa.field("diff_timeout", pa.bool_()))

    if regex_revision_pairs:
        for pair in regex_revision_pairs:
            for field in pair.get_pyarrow_fields():
                schema = schema.append(field)

    if regex_comment_pairs:
        for pair in regex_comment_pairs:
            for field in pair.get_pyarrow_fields():
                schema = schema.append(field)

    if persist != PersistMethod.none:
        # RevisionText is added to the table for extraction, but not to schema
        # (unless text=True, in which case it's already in the schema from build_table)
        schema = schema.append(pa.field("token_revs", pa.int64(), nullable=True))
        schema = schema.append(pa.field("tokens_added", pa.int64(), nullable=True))
        schema = schema.append(pa.field("tokens_removed", pa.int64(), nullable=True))
        schema = schema.append(pa.field("tokens_window", pa.int64(), nullable=True))

    return schema


def make_regex_pairs(patterns, labels) -> list:
    """Create RegexPair objects from patterns and labels."""
    if (patterns is not None and labels is not None) and (len(patterns) == len(labels)):
        return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
    elif patterns is None and labels is None:
        return []
    else:
        sys.exit("Each regular expression *must* come with a corresponding label and vice versa.")


class JSONLWriter:
    """Write JSONL output with schema validation."""

    def __init__(self, output_file: str, schema: pa.Schema, append: bool = False):
        self.output_file = output_file
        self.schema = schema
        self.field_names = [field.name for field in schema]

        if append and os.path.exists(output_file):
            self._validate_and_fix_last_line(output_file)

        mode = "a" if append else "w"
        self._file = open(output_file, mode)

    def _validate_and_fix_last_line(self, filepath: str):
        """Validate the last line of JSONL file; truncate if corrupted.

        If the previous run was interrupted mid-write, the last line may be
        incomplete JSON. This detects and removes such corrupted lines.
        """
        with open(filepath, 'rb') as f:
            f.seek(0, 2)
            file_size = f.tell()
            if file_size == 0:
                return

            # Read backwards to find the last newline
            chunk_size = min(8192, file_size)
            f.seek(-chunk_size, 2)
            chunk = f.read(chunk_size)

            # Find the last complete line
            last_newline = chunk.rfind(b'\n')
            if last_newline == -1:
                # Entire file is one line (possibly corrupted)
                last_line = chunk.decode('utf-8', errors='replace')
                truncate_pos = 0
            else:
                last_line = chunk[last_newline + 1:].decode('utf-8', errors='replace')
                truncate_pos = file_size - chunk_size + last_newline + 1

            # If last line is empty, file ends with newline - that's fine
            if not last_line.strip():
                return

            # Try to parse the last line as JSON
            try:
                json.loads(last_line)
            except json.JSONDecodeError:
                print(f"Warning: Last line of {filepath} is corrupted JSON, removing it",
                      file=sys.stderr)
                # Truncate the file to remove the corrupted last line
                with open(filepath, 'r+b') as f:
                    f.truncate(truncate_pos)

    def write_batch(self, data: dict):
        """Write a batch of rows as JSONL.

        Args:
            data: dict mapping column names to lists of values
        """
        if not data or not data.get(self.field_names[0]):
            return

        num_rows = len(data[self.field_names[0]])
        for i in range(num_rows):
            row = {}
            for name in self.field_names:
                if name in data:
                    value = data[name][i]
                    row[name] = self._convert_value(value)
            self._file.write(json.dumps(row) + "\n")

    def _convert_value(self, value):
        """Convert a value to JSON-serializable format."""
        if value is None:
            return None
        elif isinstance(value, (str, int, float, bool)):
            return value
        elif hasattr(value, "isoformat"):
            return value.isoformat()
        elif isinstance(value, (list, tuple)):
            return [self._convert_value(v) for v in value]
        elif isinstance(value, dict):
            return {k: self._convert_value(v) for k, v in value.items()}
        else:
            return str(value)

    def close(self):
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False


class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3
    wikidiff2 = 4


def diff_with_timeout(differ, last_text, text):
    """Returns (result, timed_out) tuple using native pywikidiff2 timeout."""
    result = differ.inline_json_diff(last_text, text, timeout_ms=DIFF_TIMEOUT_MS)
    return result, differ.timed_out()


def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added))


def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]:
    i = 0
    for rev in revs:
        if rev.text is None:
            rev.text = ""
        if not rev.sha1 and not rev.deleted.text:
            rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
        revs[i] = rev
        i += 1
    return revs


class WikiqIterator:
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = {
            ns.id: ns.name for ns in self.mwiterator.site_info.namespaces
        }
        self.__pages: Generator[WikiqPage] = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(
                page, namespace_map=self.namespace_map, collapse_user=self.collapse_user
            )

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)


class WikiqPage:
    __slots__ = (
        "id",
        "redirect",
        "restrictions",
        "mwpage",
        "__revisions",
        "collapse_user",
    )

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            page.title = ":".join([namespace_map[page.namespace], page.title])
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list()

    @staticmethod
    def user_text(rev) -> Union[str, None]:
        return None if rev.deleted.user else rev.user.text

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None           Never
        #         1          A               A           False
        #         2          B               A            True
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always

        if not self.collapse_user:
            for rev in self.mwpage:
                yield [rev]
            return

        for _, revs in groupby(self.mwpage, self.user_text):
            # All revisions are either from the same user, or this is a single
            # revision where the user is missing.
            yield list(revs)

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)


"""
A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups.  If it does then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result.
"""


class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def get_pyarrow_fields(self):
        if self.has_groups:
            fields = [
                pa.field(self._make_key(cap_group), pa.string())
                for cap_group in self.capture_groups
            ]
        else:
            fields = [pa.field(self.label, pa.string())]

        return fields

    def _make_key(self, cap_group):
        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content: str) -> dict:
        temp_dict = {}
        # if there are named capture groups in the regex
        if self.has_groups:
            # if there are matches of some sort in this revision content, fill the lists for each cap_group
            if self.pattern.search(content) is not None:
                m = self.pattern.finditer(content)
                matchobjects = list(m)

                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
                    if len(temp_list) == 0:
                        temp_dict[key] = None
                    # else we put in the list we made in the for-loop above
                    else:
                        temp_dict[key] = ", ".join(temp_list)

            # there are no matches at all in this revision content, we default values to None
            else:
                for cap_group in self.capture_groups:
                    key = self._make_key(cap_group)
                    temp_dict[key] = None

        # there are no capture groups, we just search for all the matches of the regex
        else:
            # given that there are matches to be made
            if type(content) in (str, bytes):
                if self.pattern.search(content) is not None:
                    m = self.pattern.findall(content)
                    temp_dict[self.label] = ", ".join(m)
                else:
                    temp_dict[self.label] = None

        return temp_dict


class WikiqParser:
    def __init__(
        self,
        input_file: Union[TextIOWrapper, IO[Any], IO[bytes]],
        output_file: Union[TextIO, str],
        regex_match_revision: list[str],
        regex_match_comment: list[str],
        regex_revision_label: list[str],
        regex_comment_label: list[str],
        text: bool = False,
        diff: bool = False,
        collapse_user: bool = False,
        persist: int = None,
        namespaces: Union[list[int], None] = None,
        revert_radius: int = 15,
        output_jsonl: bool = False,
        output_jsonl_dir: bool = False,
        output_parquet: bool = False,
        batch_size: int = 1024,
        resume_point: Union[tuple, dict, None] = None,
        partition_namespaces: bool = False,
        external_links: bool = False,
        citations: bool = False,
        wikilinks: bool = False,
        templates: bool = False,
        headings: bool = False,
        time_limit_seconds: Union[float, None] = None,
        max_revisions_per_file: int = 0,
    ):
        """
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
           resume_point : if set, either a (pageid, revid) tuple for single-file output,
                          or a dict mapping namespace -> (pageid, revid) for partitioned output.
                          For single-file: skip all revisions up to and including this point.
           max_revisions_per_file : if > 0, close and rotate output files after this many revisions
        """
        self.input_file = input_file

        self.collapse_user: bool = collapse_user
        self.persist: int = persist
        self.namespaces = []
        self.revert_radius = revert_radius
        self.diff = diff
        self.text = text
        self.partition_namespaces = partition_namespaces
        self.resume_point = resume_point
        self.external_links = external_links
        self.citations = citations
        self.wikilinks = wikilinks
        self.templates = templates
        self.headings = headings
        self.shutdown_requested = False
        self.time_limit_seconds = time_limit_seconds
        self.max_revisions_per_file = max_revisions_per_file
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None

        self.regex_schemas = []
        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(
            regex_match_revision, regex_revision_label
        )
        self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(
            regex_match_comment, regex_comment_label
        )

        # Initialize output
        self.batch_size = batch_size
        self.output_jsonl = output_jsonl
        self.output_jsonl_dir = output_jsonl_dir
        self.output_parquet = output_parquet
        self.output_file = output_file

        if output_parquet:
            self.pq_writer = None
            self.parquet_buffer = []
        elif output_jsonl:
            pass  # JSONLWriter created in process()
        else:
            # TSV output
            self.print_header = True
            if output_file == sys.stdout.buffer:
                pass
            else:
                self.output_file = open(output_file, "wb")

        # Checkpoint for tracking resume point (path only, no open file handle for NFS safety)
        self.checkpoint_path = None
        self.checkpoint_state = {}  # namespace -> (pageid, revid) or None -> (pageid, revid)

    def request_shutdown(self):
        """Request graceful shutdown. The process() method will exit after completing the current batch."""
        self.shutdown_requested = True

    def _time_limit_expired(self):
        """Timer callback when time limit is reached."""
        hours = self.time_limit_seconds / 3600
        print(f"Time limit of {hours:.2f} hours reached, requesting shutdown...", file=sys.stderr)
        self.request_shutdown()

    def _start_time_limit_timer(self):
        """Start a background timer to trigger shutdown when time limit is reached."""
        if self.time_limit_seconds is None:
            return None
        timer = threading.Timer(self.time_limit_seconds, self._time_limit_expired)
        timer.daemon = True
        timer.start()
        return timer

    def _cancel_time_limit_timer(self, timer):
        """Cancel the time limit timer if it's still running."""
        if timer is not None:
            timer.cancel()

    def _get_part_path(self, base_path, part_num):
        """Generate path with part number inserted before extension.

        Example: output.parquet -> output.part0.parquet
        """
        path = Path(base_path)
        return path.parent / f"{path.stem}.part{part_num}{path.suffix}"

    def _open_checkpoint(self, output_file):
        """Enable checkpointing for Parquet output only.

        JSONL doesn't need checkpoint files - resume point is derived from last line.
        """
        if not self.output_parquet or output_file == sys.stdout.buffer:
            return
        self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
        Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
        print(f"Checkpoint enabled: {self.checkpoint_path}", file=sys.stderr)

    def _update_checkpoint(self, pageid, revid, namespace=None, part=0):
        """Update checkpoint state and write atomically (NFS-safe)."""
        if self.checkpoint_path is None:
            return
        if self.partition_namespaces:
            self.checkpoint_state[namespace] = {"pageid": pageid, "revid": revid, "part": part}
        else:
            self.checkpoint_state = {"pageid": pageid, "revid": revid, "part": part}
        # Atomic write: write to temp file, then rename
        temp_path = self.checkpoint_path + ".tmp"
        with open(temp_path, 'w') as f:
            json.dump(self.checkpoint_state, f)
        os.replace(temp_path, self.checkpoint_path)

    def _close_checkpoint(self, delete=False):
        """Clean up checkpoint, optionally deleting it."""
        if self.checkpoint_path is None:
            return
        if delete and os.path.exists(self.checkpoint_path):
            os.remove(self.checkpoint_path)
            print(f"Checkpoint deleted: {self.checkpoint_path}", file=sys.stderr)
        elif os.path.exists(self.checkpoint_path):
            print(f"Checkpoint preserved for resume: {self.checkpoint_path}", file=sys.stderr)
        # Clean up any leftover temp file
        temp_path = self.checkpoint_path + ".tmp"
        if os.path.exists(temp_path):
            os.remove(temp_path)

    def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_base_paths, sorting_cols, namespace=None, part_numbers=None):
        """Write a batch of rows to the appropriate writer.

        For partitioned output, creates writer lazily if needed.
        Returns (writer, num_rows) - writer used and number of rows written.
        """
        num_rows = len(row_buffer.get("revid", []))
        if self.partition_namespaces and namespace is not None:
            if namespace not in pq_writers:
                base_path = ns_base_paths[namespace]
                part_num = part_numbers.get(namespace, 0) if part_numbers else 0
                if self.max_revisions_per_file > 0:
                    ns_path = self._get_part_path(base_path, part_num)
                else:
                    ns_path = base_path
                Path(ns_path).parent.mkdir(exist_ok=True, parents=True)
                pq_writers[namespace] = pq.ParquetWriter(
                    ns_path, schema, flavor="spark", sorting_columns=sorting_cols
                )
            writer = pq_writers[namespace]
        writer.write(pa.record_batch(row_buffer, schema=schema))
        return writer, num_rows

    def make_matchmake_pairs(self, patterns, labels) -> list[RegexPair]:
        if (patterns is not None and labels is not None) and (
            len(patterns) == len(labels)
        ):
            result: list[RegexPair] = []
            for pattern, label in zip(patterns, labels):
                rp = RegexPair(pattern, label)
                result.append(rp)
                self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
            return result
        elif (patterns is None) and (labels is None):
            return []
        else:
            sys.exit(
                "Each regular expression *must* come with a corresponding label and vice versa."
            )

    def matchmake_revision(self, rev: mwxml.Revision):
        result = self.matchmake_text(rev.text)
        for k, v in self.matchmake_comment(rev.comment).items():
            result[k] = v
        return result

    def matchmake_text(self, text: str):
        return self.matchmake_pairs(text, self.regex_revision_pairs)

    def matchmake_comment(self, comment: str):
        return self.matchmake_pairs(comment, self.regex_comment_pairs)

    @staticmethod
    def matchmake_pairs(text, pairs):
        result = {}
        for pair in pairs:
            for k, v in pair.matchmake(text).items():
                result[k] = v
        return result

    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns

    def process(self):
        # Start time limit timer if configured
        time_limit_timer = self._start_time_limit_timer()

        # Track whether we've passed the resume point
        if self.resume_point is None:
            found_resume_point = True
        elif self.partition_namespaces:
            found_resume_point = {}
        else:
            found_resume_point = False

        # When resuming with parquet, write new data to temp file/directory and merge at the end
        original_output_file = None
        temp_output_file = None
        original_partition_dir = None
        if self.resume_point is not None and self.output_parquet:
            original_output_file, temp_output_file, original_partition_dir = \
                setup_resume_temp_output(self.output_file, self.partition_namespaces)
            if temp_output_file is not None:
                self.output_file = temp_output_file

        # Open checkpoint file for tracking resume point
        checkpoint_output = original_output_file if original_output_file else self.output_file
        self._open_checkpoint(checkpoint_output)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        table, reverts_column = build_table(
            text=self.text,
            collapse_user=self.collapse_user,
            external_links=self.external_links,
            citations=self.citations,
            wikilinks=self.wikilinks,
            templates=self.templates,
            headings=self.headings,
        )

        # Extract list of namespaces
        self.namespaces = {
            ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
        }

        page_count = 0
        rev_count = 0

        schema = build_schema(
            table,
            diff=self.diff,
            persist=self.persist,
            text=self.text,
            regex_revision_pairs=self.regex_revision_pairs,
            regex_comment_pairs=self.regex_comment_pairs,
        )

        # Add RevisionText to table for diff/persist computation (extraction only, not output)
        if (self.diff or self.persist != PersistMethod.none) and not self.text:
            table.columns.append(tables.RevisionText())

        # Initialize writer
        writer = None
        sorting_cols = None
        ns_base_paths = {}
        pq_writers = {}
        part_numbers = {}

        if self.output_parquet:
            pageid_sortingcol = pq.SortingColumn(schema.get_field_index("articleid"))
            revid_sortingcol = pq.SortingColumn(schema.get_field_index("revid"))
            sorting_cols = [pageid_sortingcol, revid_sortingcol]

            if self.resume_point is not None:
                if self.partition_namespaces:
                    for ns, resume_data in self.resume_point.items():
                        part_numbers[ns] = resume_data[2] if len(resume_data) > 2 else 0
                else:
                    part_numbers[None] = self.resume_point[2] if len(self.resume_point) > 2 else 0

            if not self.partition_namespaces:
                if self.max_revisions_per_file > 0:
                    output_path_with_part = self._get_part_path(self.output_file, part_numbers.get(None, 0))
                else:
                    output_path_with_part = self.output_file
                writer = pq.ParquetWriter(
                    output_path_with_part,
                    schema,
                    flavor="spark",
                    sorting_columns=sorting_cols,
                )
            else:
                output_path = Path(self.output_file)
                if self.namespace_filter is not None:
                    namespaces = self.namespace_filter
                else:
                    namespaces = self.namespaces.values()
                ns_base_paths = {
                    ns: (output_path.parent / f"namespace={ns}") / output_path.name
                    for ns in namespaces
                }
                for ns in namespaces:
                    if ns not in part_numbers:
                        part_numbers[ns] = 0
        elif self.output_jsonl:
            append_mode = self.resume_point is not None
            if self.output_jsonl_dir:
                # Create directory for JSONL output
                Path(self.output_file).mkdir(parents=True, exist_ok=True)
                part_num = 0
                if self.resume_point is not None and len(self.resume_point) > 2:
                    part_num = self.resume_point[2]
                part_numbers[None] = part_num
                jsonl_path = self._get_part_path(
                    Path(self.output_file) / "data.jsonl", part_num
                )
                writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
            else:
                writer = JSONLWriter(self.output_file, schema, append=append_mode)
        else:
            writer = pacsv.CSVWriter(
                self.output_file,
                schema,
                write_options=pacsv.WriteOptions(delimiter="\t"),
            )

        # Initialize diff machinery
        differ = None
        fast_differ = None
        if self.diff:
            differ = pywikidiff2.pywikidiff2(
                num_context_lines=1000000,
                max_word_level_diff_complexity=-1,
                moved_paragraph_detection_cutoff=-1,
                words_cache_capacity=10000,
                diff_cache_capacity=10000,
                stats_cache_capacity=10000,
            )
            fast_differ = pywikidiff2.pywikidiff2(
                num_context_lines=1000000,
                max_word_level_diff_complexity=40000000,
                moved_paragraph_detection_cutoff=100,
                words_cache_capacity=-1,
                diff_cache_capacity=-1,
                stats_cache_capacity=-1,
            )

        # Write buffer: accumulate rows before flushing
        write_buffer = defaultdict(list)
        buffer_count = 0
        last_namespace = None

        def flush_buffer():
            nonlocal write_buffer, buffer_count, last_namespace
            if buffer_count == 0:
                return
            row_buffer = dict(write_buffer)
            namespace = last_namespace
            if self.output_parquet:
                if self.partition_namespaces:
                    self._write_batch(
                        row_buffer, schema, writer, pq_writers, ns_base_paths,
                        sorting_cols, namespace=namespace, part_numbers=part_numbers
                    )
                else:
                    writer.write(pa.record_batch(row_buffer, schema=schema))
            elif self.output_jsonl:
                writer.write_batch(row_buffer)
            else:
                writer.write(pa.record_batch(row_buffer, schema=schema))

            # Update checkpoint
            last_pageid = row_buffer["articleid"][-1]
            last_revid = row_buffer["revid"][-1]
            part = part_numbers.get(namespace if self.partition_namespaces else None, 0)
            self._update_checkpoint(last_pageid, last_revid,
                                    namespace=namespace if self.partition_namespaces else None,
                                    part=part)
            write_buffer = defaultdict(list)
            buffer_count = 0

        # Iterate through pages
        for page in dump:
            # Skip namespaces not in the filter
            if self.namespace_filter is not None:
                if page.mwpage.namespace not in self.namespace_filter:
                    continue

            # Resume logic: skip pages before the resume point
            is_resume_page = False
            page_resume_revid = None
            if self.resume_point is not None and not found_resume_point:
                page_id = page.mwpage.id
                resume_pageid, resume_revid = self.resume_point[0], self.resume_point[1]
                if page_id < resume_pageid:
                    continue
                elif page_id == resume_pageid:
                    is_resume_page = True
                    page_resume_revid = resume_revid
                else:
                    found_resume_point = True

            # Reset revert detector for new page
            if self.revert_radius > 0:
                reverts_column.rev_detector = mwreverts.Detector(radius=self.revert_radius)
            else:
                reverts_column.rev_detector = None

            # State for this page
            prev_text = ""
            persist_state = None
            persist_window = None

            if self.persist != PersistMethod.none:
                persist_window = deque(maxlen=PERSISTENCE_RADIUS)
                if self.persist == PersistMethod.sequence:
                    persist_state = mwpersistence.DiffState(
                        SequenceMatcher(tokenizer=wikitext_split),
                        revert_radius=PERSISTENCE_RADIUS,
                    )
                elif self.persist == PersistMethod.segment:
                    persist_state = mwpersistence.DiffState(
                        SegmentMatcher(tokenizer=wikitext_split),
                        revert_radius=PERSISTENCE_RADIUS,
                    )
                elif self.persist == PersistMethod.wikidiff2:
                    wikidiff_matcher = WikiDiffMatcher(tokenizer=wikitext_split)
                    persist_state = mwpersistence.DiffState(
                        wikidiff_matcher, revert_radius=PERSISTENCE_RADIUS
                    )
                else:
                    from mw.lib import persistence
                    persist_state = persistence.State()

            # Pending persistence values waiting for window to fill
            pending_persistence = []

            # Use peekable to detect last revision in page
            revs_iter = peekable(page)

            for revs in revs_iter:
                # revs is either a single revision or a group (collapse_user mode)
                revs = list(revs)
                revs = fix_hex_digests(revs)
                rev = revs[-1]  # Last revision in the group
                is_last_in_page = revs_iter.peek(None) is None

                # Skip revisions before resume point
                if is_resume_page:
                    if rev.id <= page_resume_revid:
                        # Update state for correctness when we resume output
                        if self.diff or self.persist != PersistMethod.none:
                            prev_text = rev.text or ""
                        if persist_state is not None:
                            text = rev.text or ""
                            if self.persist != PersistMethod.legacy:
                                persist_state.update(text, rev.id)
                            else:
                                persist_state.process(text, rev.id)
                        # Update revert detector so it has history for post-resume revisions
                        if reverts_column.rev_detector is not None and not rev.deleted.text:
                            reverts_column.rev_detector.process(rev.sha1, rev.id)
                        if rev.id == page_resume_revid:
                            found_resume_point = True
                            is_resume_page = False
                            print(f"Resuming output after revid {rev.id}", file=sys.stderr)
                        continue

                rev_count += 1

                # Extract base row data
                row = table.extract_row(page.mwpage, revs)

                # Compute revert flag
                if self.revert_radius == 0 or row["deleted"]:
                    row["revert"] = None
                else:
                    row["revert"] = row["reverteds"] is not None

                # Regex matching
                regex_dict = self.matchmake_revision(rev)
                for k, v in regex_dict.items():
                    row[k] = v

                # Compute diff
                text = row.get("text", "") or ""
                if self.diff:
                    diff_result, timed_out = diff_with_timeout(differ, prev_text, text)
                    if timed_out:
                        print(f"WARNING! wikidiff2 timeout for rev: {rev.id}. Falling back to default limits.", file=sys.stderr)
                        diff_result = fast_differ.inline_json_diff(prev_text, text)
                    row["diff"] = [entry for entry in json.loads(diff_result)["diff"] if entry["type"] != 0]
                    row["diff_timeout"] = timed_out

                # Compute persistence
                if persist_state is not None:
                    if self.persist != PersistMethod.legacy:
                        _, tokens_added, tokens_removed = persist_state.update(text, rev.id)
                    else:
                        _, tokens_added, tokens_removed = persist_state.process(text, rev.id)

                    persist_window.append((rev.id, tokens_added, tokens_removed))
                    pending_persistence.append(row)

                    # When window is full, emit persistence for oldest revision
                    if len(persist_window) == PERSISTENCE_RADIUS:
                        old_rev_id, old_tokens_added, old_tokens_removed = persist_window.popleft()
                        oldest_row = pending_persistence.pop(0)
                        num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
                        oldest_row["token_revs"] = num_token_revs
                        oldest_row["tokens_added"] = num_tokens
                        oldest_row["tokens_removed"] = len(old_tokens_removed)
                        oldest_row["tokens_window"] = PERSISTENCE_RADIUS - 1

                        # Remove text if not outputting it
                        if not self.text and "text" in oldest_row:
                            del oldest_row["text"]

                        # Add to write buffer
                        for k, v in oldest_row.items():
                            write_buffer[k].append(v)
                        buffer_count += 1
                        last_namespace = page.mwpage.namespace

                        if buffer_count >= self.batch_size:
                            flush_buffer()

                # Update prev_text for next iteration
                if self.diff or self.persist != PersistMethod.none:
                    prev_text = text

                # If no persistence, write row directly
                if persist_state is None:
                    if not self.text and "text" in row:
                        del row["text"]

                    for k, v in row.items():
                        write_buffer[k].append(v)
                    buffer_count += 1
                    last_namespace = page.mwpage.namespace

                    if buffer_count >= self.batch_size:
                        flush_buffer()

                # Check for shutdown
                if self.shutdown_requested:
                    print("Shutdown requested, closing writers...", file=sys.stderr)
                    break

            # End of page: flush remaining persistence window
            if persist_state is not None and not self.shutdown_requested:
                for i, (pending_row, window_item) in enumerate(zip(pending_persistence, persist_window)):
                    rev_id, tokens_added, tokens_removed = window_item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
                    pending_row["token_revs"] = num_token_revs
                    pending_row["tokens_added"] = num_tokens
                    pending_row["tokens_removed"] = len(tokens_removed)
                    pending_row["tokens_window"] = len(persist_window) - (i + 1)

                    if not self.text and "text" in pending_row:
                        del pending_row["text"]

                    for k, v in pending_row.items():
                        write_buffer[k].append(v)
                    buffer_count += 1
                    last_namespace = page.mwpage.namespace

            if self.shutdown_requested:
                break
            page_count += 1

        # Flush remaining buffer
        flush_buffer()

        # Cancel time limit timer
        self._cancel_time_limit_timer(time_limit_timer)

        print(
            "Done: %s revisions and %s pages." % (rev_count, page_count),
            file=sys.stderr,
        )

        # Close all writers
        if self.output_parquet and self.partition_namespaces:
            for pq_writer in pq_writers.values():
                pq_writer.close()
        elif writer is not None:
            writer.close()

        # Close checkpoint file; delete it only if we completed without interruption
        self._close_checkpoint(delete=not self.shutdown_requested)

        # Merge temp output with original for parquet resume
        if original_output_file is not None and temp_output_file is not None:
            finalize_resume_merge(
                original_output_file,
                temp_output_file,
                self.partition_namespaces,
                original_partition_dir
            )

def match_archive_suffix(input_filename):
    if re.match(r".*\.7z$", input_filename):
        cmd = ["7za", "x", "-so", input_filename]
    elif re.match(r".*\.gz$", input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r".*\.bz2$", input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        raise ValueError("Unrecognized file type: %s" % input_filename)
    return cmd


def open_input_file(input_filename, fandom_2020=False):
    cmd = match_archive_suffix(input_filename)
    if fandom_2020:
        cmd.append("*.xml")
    try:
        return Popen(cmd, stdout=PIPE).stdout
    except NameError:
        return open(input_filename, "r")


def get_output_filename(input_filename, output_format='tsv') -> str:
    """Generate output filename based on input filename and format.

    Args:
        input_filename: Input dump file path
        output_format: 'tsv', 'jsonl', or 'parquet'
    """
    output_filename = re.sub(r"\.(7z|gz|bz2)?$", "", input_filename)
    output_filename = re.sub(r"\.xml", "", output_filename)
    if output_format == 'jsonl':
        output_filename = output_filename + ".jsonl"
    elif output_format == 'parquet':
        output_filename = output_filename + ".parquet"
    else:
        output_filename = output_filename + ".tsv"
    return output_filename


def main():
    parser = argparse.ArgumentParser(
        description="Parse MediaWiki XML database dumps into tab delimited data."
    )

    # arguments for the input direction
    parser.add_argument(
        "dumpfiles",
        metavar="DUMPFILE",
        nargs="*",
        type=str,
        help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.",
    )

    parser.add_argument(
        "-o",
        "--output",
        metavar="OUTPUT",
        dest="output",
        type=str,
        nargs=1,
        help="Output file or directory. Format is detected from extension: .jsonl for JSONL, .parquet for Parquet, otherwise TSV.",
    )

    parser.add_argument(
        "-s",
        "--stdout",
        dest="stdout",
        action="store_true",
        help="Write output to standard out (do not create dump file)",
    )

    parser.add_argument(
        "--print-schema",
        dest="print_schema",
        action="store_true",
        help="Print the Spark-compatible JSON schema for the output and exit. No dump file is processed.",
    )

    parser.add_argument(
        "--collapse-user",
        dest="collapse_user",
        action="store_true",
        help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.",
    )

    parser.add_argument(
        "-p",
        "--persistence",
        dest="persist",
        default=None,
        const="",
        type=str,
        choices=["", "wikidiff2", "segment", "sequence", "legacy"],
        nargs="?",
        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is no persistence. -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. -p=segment attempts advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which (should be) faster and more robust.",
    )

    parser.add_argument(
        "-n",
        "--namespace-include",
        dest="namespace_filter",
        type=int,
        action="append",
        help="Id number of namespace to include. Can be specified more than once.",
    )

    parser.add_argument(
        "-rr",
        "--revert-radius",
        dest="revert_radius",
        type=int,
        action="store",
        default=15,
        help="Number of edits to check when looking for reverts (default: 15)",
    )

    parser.add_argument(
        "-RP",
        "--revision-pattern",
        dest="regex_match_revision",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in revision text. The regex must be surrounded by quotes.",
    )

    parser.add_argument(
        "-RPl",
        "--revision-pattern-label",
        dest="regex_revision_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in revision text.",
    )

    parser.add_argument(
        "-CP",
        "--comment-pattern",
        dest="regex_match_comment",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in comments of revisions.",
    )

    parser.add_argument(
        "-CPl",
        "--comment-pattern-label",
        dest="regex_comment_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in comments.",
    )

    parser.add_argument(
        "-d",
        "--diff",
        dest="diff",
        default=False,
        action="store_true",
        help="Output a diff structure for each revision with information about changed or moved lines.",
    )

    parser.add_argument(
        "-t",
        "--text",
        dest="text",
        default=False,
        action="store_true",
        help="Output the text of the revision.",
    )

    parser.add_argument(
        "--external-links",
        dest="external_links",
        action="store_true",
        default=False,
        help="Extract external links from each revision using mwparserfromhell.",
    )

    parser.add_argument(
        "--citations",
        dest="citations",
        action="store_true",
        default=False,
        help="Extract citations (ref tags and cite templates) from each revision.",
    )

    parser.add_argument(
        "--wikilinks",
        dest="wikilinks",
        action="store_true",
        default=False,
        help="Extract internal wikilinks from each revision.",
    )

    parser.add_argument(
        "--templates",
        dest="templates",
        action="store_true",
        default=False,
        help="Extract templates with their parameters from each revision.",
    )

    parser.add_argument(
        "--headings",
        dest="headings",
        action="store_true",
        default=False,
        help="Extract section headings from each revision.",
    )

    parser.add_argument(
        "--fandom-2020",
        dest="fandom_2020",
        action="store_true",
        help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.",
    )

    parser.add_argument(
        "--batch-size",
        dest="batch_size",
        default=1500,
        type=int,
        help="How many revisions to process in each batch. This ends up being the Parquet row group size",
    )

    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="Resume processing from the last successfully written revision in the output file.",
    )

    parser.add_argument(
        "--time-limit",
        dest="time_limit",
        type=float,
        default=0,
        help="Time limit in hours before graceful shutdown. Set to 0 to disable (default).",
    )

    parser.add_argument(
        "--partition-namespaces",
        dest="partition_namespaces",
        action="store_true",
        default=False,
        help="For Parquet output, partition output by namespace into separate files.",
    )

    parser.add_argument(
        "--max-revisions-per-file",
        dest="max_revisions_per_file",
        type=int,
        default=0,
        help="For Parquet output, split output into multiple files after this many revisions. Set to 0 to disable (default).",
    )

    args = parser.parse_args()

    # set persistence method

    if args.persist is None:
        persist = PersistMethod.none
    elif args.persist == "segment":
        persist = PersistMethod.segment
    elif args.persist == "legacy":
        persist = PersistMethod.legacy
    elif args.persist == "wikidiff2":
        persist = PersistMethod.wikidiff2
    else:
        persist = PersistMethod.sequence

    if args.namespace_filter is not None:
        namespaces = args.namespace_filter
    else:
        namespaces = None

    # Handle --print-schema: build and output schema, then exit
    if args.print_schema:
        regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
        regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)

        table, _ = build_table(
            text=args.text,
            collapse_user=args.collapse_user,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
        )
        schema = build_schema(
            table,
            diff=args.diff,
            persist=persist,
            text=args.text,
            regex_revision_pairs=regex_revision_pairs,
            regex_comment_pairs=regex_comment_pairs,
        )

        spark_schema = pyarrow_to_spark_schema(schema)
        print(json.dumps(spark_schema, indent=2))
        sys.exit(0)

    print(args, file=sys.stderr)
    if len(args.dumpfiles) > 0:
        for filename in args.dumpfiles:
            # Determine output file path before opening input (so resume errors are caught early)
            if args.output:
                output = args.output[0]
            else:
                output = "."

            # Detect output format from extension
            output_jsonl_dir = output.endswith(".jsonl.d")
            output_jsonl = output.endswith(".jsonl") or output_jsonl_dir
            output_parquet = output.endswith(".parquet")
            partition_namespaces = args.partition_namespaces and output_parquet

            if args.stdout:
                output_file = sys.stdout.buffer
            elif output_jsonl or output_parquet:
                # Output is a JSONL or Parquet file path - use it directly
                output_file = output
            elif os.path.isdir(output):
                # Output is a directory - derive filename from input
                output_filename = os.path.join(output, os.path.basename(filename))
                output_file = get_output_filename(output_filename, output_format='tsv')
            else:
                output_file = output

            # Handle resume functionality before opening input file
            resume_point = None
            if args.resume:
                if (output_jsonl or output_parquet) and not args.stdout:
                    # Clean up any interrupted resume from previous run
                    if output_parquet:
                        cleanup_result = cleanup_interrupted_resume(output_file, partition_namespaces)
                        if cleanup_result == "start_fresh":
                            resume_point = None
                        else:
                            resume_point = get_resume_point(output_file, partition_namespaces)
                    else:
                        # JSONL: get resume point from last line of file (no checkpoint)
                        resume_point = get_resume_point(output_file, input_file=filename)
                    if resume_point is not None:
                        if isinstance(resume_point, dict):
                            print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
                        else:
                            pageid, revid = resume_point[0], resume_point[1]
                            print(f"Resuming from checkpoint: pageid={pageid}, revid={revid}", file=sys.stderr)
                else:
                    sys.exit("Error: --resume only works with JSONL or Parquet output (not stdout or TSV)")

            # Now open the input file
            print("Processing file: %s" % filename, file=sys.stderr)
            input_file = open_input_file(filename, args.fandom_2020)

            time_limit_seconds = args.time_limit * 3600 if args.time_limit > 0 else None

            wikiq = WikiqParser(
                input_file,
                output_file,
                collapse_user=args.collapse_user,
                persist=persist,
                namespaces=namespaces,
                revert_radius=args.revert_radius,
                regex_match_revision=args.regex_match_revision,
                regex_revision_label=args.regex_revision_label,
                regex_match_comment=args.regex_match_comment,
                regex_comment_label=args.regex_comment_label,
                text=args.text,
                diff=args.diff,
                output_jsonl=output_jsonl,
                output_jsonl_dir=output_jsonl_dir,
                output_parquet=output_parquet,
                partition_namespaces=partition_namespaces,
                batch_size=args.batch_size,
                resume_point=resume_point,
                external_links=args.external_links,
                citations=args.citations,
                wikilinks=args.wikilinks,
                templates=args.templates,
                headings=args.headings,
                time_limit_seconds=time_limit_seconds,
                max_revisions_per_file=args.max_revisions_per_file,
            )

            # Register signal handlers for graceful shutdown (CLI only)
            def handle_shutdown(signum, frame):
                sig_name = signal.Signals(signum).name
                print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
                wikiq.request_shutdown()

            original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
            original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
            original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
            original_sigusr2 = signal.signal(signal.SIGUSR2, handle_shutdown)

            try:
                wikiq.process()
            finally:
                # Restore original signal handlers
                signal.signal(signal.SIGTERM, original_sigterm)
                signal.signal(signal.SIGINT, original_sigint)
                signal.signal(signal.SIGUSR1, original_sigusr1)
                signal.signal(signal.SIGUSR2, original_sigusr2)

            # close things
            input_file.close()

    else:
        if args.resume:
            print("Warning: --resume cannot be used with stdin/stdout", file=sys.stderr)

        time_limit_seconds = args.time_limit * 3600 if args.time_limit > 0 else None

        wikiq = WikiqParser(
            sys.stdin,
            sys.stdout,
            collapse_user=args.collapse_user,
            persist=persist,
            # persist_legacy=args.persist_legacy,
            namespaces=namespaces,
            revert_radius=args.revert_radius,
            regex_match_revision=args.regex_match_revision,
            regex_revision_label=args.regex_revision_label,
            regex_match_comment=args.regex_match_comment,
            regex_comment_label=args.regex_comment_label,
            diff=args.diff,
            text=args.text,
            batch_size=args.batch_size,
            resume_point=None,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
            time_limit_seconds=time_limit_seconds,
        )

        # Register signal handlers for graceful shutdown (CLI only)
        def handle_shutdown(signum, frame):
            sig_name = signal.Signals(signum).name
            print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
            wikiq.request_shutdown()

        original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
        original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
        original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)

        try:
            wikiq.process()
        finally:
            # Restore original signal handlers
            signal.signal(signal.SIGTERM, original_sigterm)
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGUSR1, original_sigusr1)

    # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    # stop_words = stop_words.split(",")


if __name__ == "__main__":
    main()