"""
|
|
Checkpoint and resume functionality for wikiq parquet output.
|
|
|
|
This module handles:
|
|
- Finding resume points in existing parquet output
|
|
- Merging resumed data with existing output (streaming, memory-efficient)
|
|
- Checkpoint file management for fast resume point lookup
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
import pyarrow.parquet as pq
|
|
|
|
|
|
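
# Typical call order, pieced together from the docstrings below (an illustrative
# sketch, not an API guarantee; the actual orchestration lives in wikiq itself):
#
#   1. cleanup_interrupted_resume(output_file, partition_namespaces)
#        -> merge leftover *.resume_temp files; may return "start_fresh"
#   2. get_resume_point(output_file, partition_namespaces)
#        -> (pageid, revid) or a per-namespace dict, from checkpoint or parquet scan
#   3. setup_resume_temp_output(output_file, partition_namespaces)
#        -> write new rows to the returned temp path
#   4. finalize_resume_merge(original, temp, partition_namespaces, partition_dir)
#        -> stream-merge the temp output back into the original files
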
def cleanup_interrupted_resume(output_file, partition_namespaces):
    """
    Merge any leftover .resume_temp files from a previous interrupted run.

    This should be called BEFORE get_resume_point() so the resume point
    is calculated from the merged data.

    Returns:
        None - no temp files found or normal merge completed
        "start_fresh" - both original and temp were corrupted and deleted
    """
    if partition_namespaces:
        partition_dir = os.path.dirname(output_file)
        output_filename = os.path.basename(output_file)
        temp_suffix = ".resume_temp"

        if not os.path.isdir(partition_dir):
            return

        has_old_temp_files = False
        for ns_dir in os.listdir(partition_dir):
            if ns_dir.startswith('namespace='):
                temp_path = os.path.join(partition_dir, ns_dir, output_filename + temp_suffix)
                if os.path.exists(temp_path):
                    has_old_temp_files = True
                    break

        if has_old_temp_files:
            print(f"Found leftover temp files in {partition_dir} from previous interrupted partitioned run, merging first...", file=sys.stderr)
            had_corruption = merge_partitioned_namespaces(partition_dir, temp_suffix)

            # Check if any valid data remains after the merge
            has_valid_data = False
            for ns_dir in os.listdir(partition_dir):
                if ns_dir.startswith('namespace='):
                    ns_path = os.path.join(partition_dir, ns_dir)
                    parquet_files = [f for f in os.listdir(ns_path)
                                     if f.endswith('.parquet') and not f.endswith('.resume_temp')]
                    if parquet_files:
                        has_valid_data = True
                        break

            if had_corruption and not has_valid_data:
                # All data was corrupted, remove the checkpoint and start fresh
                checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
                if os.path.exists(checkpoint_path):
                    os.remove(checkpoint_path)
                print("All partitioned files were corrupted, will start fresh.", file=sys.stderr)
                return "start_fresh"

            print("Previous temp files merged successfully.", file=sys.stderr)
    else:
        temp_output_file = output_file + ".resume_temp"
        if os.path.exists(temp_output_file) and not os.path.isdir(temp_output_file):
            print(f"Found leftover temp file {temp_output_file} from previous interrupted run, merging first...", file=sys.stderr)
            merged_path = output_file + ".merged"
            merged = merge_parquet_files(output_file, temp_output_file, merged_path)
            if merged == "original_only":
                # Temp file was invalid, just remove it
                os.remove(temp_output_file)
            elif merged == "temp_only":
                # Original was corrupted, use temp as the new base
                os.remove(output_file)
                os.rename(temp_output_file, output_file)
                print("Recovered from temp file (original was corrupted).", file=sys.stderr)
            elif merged == "both_invalid":
                # Both files corrupted, remove both and start fresh
                os.remove(output_file)
                os.remove(temp_output_file)
                # Also remove the stale checkpoint file
                checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
                if os.path.exists(checkpoint_path):
                    os.remove(checkpoint_path)
                print("Both files were corrupted, will start fresh.", file=sys.stderr)
                return "start_fresh"
            elif merged == "merged":
                os.remove(output_file)
                os.rename(merged_path, output_file)
                os.remove(temp_output_file)
                print("Previous temp file merged successfully.", file=sys.stderr)
            else:
                # Both empty - unusual
                os.remove(temp_output_file)

def get_checkpoint_path(output_file, partition_namespaces=False):
    """Get the path to the checkpoint file for a given output file.

    For partitioned output, the checkpoint is placed outside the partition directory
    to avoid pyarrow trying to read it as a parquet file. The filename includes
    the output filename to keep it unique per input file (for parallel jobs).
    """
    if partition_namespaces:
        # output_file is like partition_dir/output.parquet
        # checkpoint should be at parent level: parent/output.parquet.checkpoint
        partition_dir = os.path.dirname(output_file)
        output_filename = os.path.basename(output_file)
        parent_dir = os.path.dirname(partition_dir)
        return os.path.join(parent_dir, output_filename + ".checkpoint")
    return str(output_file) + ".checkpoint"

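
# Illustrative path resolution (hypothetical paths, derived from the logic above):
#
#   partitioned:  output_file = "out/enwiki/dump.parquet"
#                 (namespace=* subdirs live under "out/enwiki/")
#                 -> checkpoint at "out/dump.parquet.checkpoint"
#   single file:  output_file = "out/dump.parquet"
#                 -> checkpoint at "out/dump.parquet.checkpoint"
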
def read_checkpoint(output_file, partition_namespaces=False):
    """
    Read the resume point from the checkpoint file if it exists.

    Checkpoint format:
        Single file: {"pageid": 54, "revid": 325}
        Partitioned: {"0": {"pageid": 54, "revid": 325}, "1": {"pageid": 123, "revid": 456}}

    Returns:
        For single files: A tuple (pageid, revid), or None if not found.
        For partitioned: A dict mapping namespace -> (pageid, revid), or None.
    """
    checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
    if not os.path.exists(checkpoint_path):
        return None

    try:
        with open(checkpoint_path, 'r') as f:
            data = json.load(f)

        if not data:
            return None

        # Single-file format: {"pageid": ..., "revid": ...}
        if "pageid" in data and "revid" in data:
            return (data["pageid"], data["revid"])

        # Partitioned format: {"0": {"pageid": ..., "revid": ...}, ...}
        result = {}
        for key, value in data.items():
            result[int(key)] = (value["pageid"], value["revid"])

        return result if result else None

    except (json.JSONDecodeError, IOError, KeyError, TypeError) as e:
        print(f"Warning: Could not read checkpoint file {checkpoint_path}: {e}", file=sys.stderr)
        return None

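
# Example (values taken from the docstring above): a partitioned checkpoint stored as
#   {"0": {"pageid": 54, "revid": 325}, "1": {"pageid": 123, "revid": 456}}
# is returned as {0: (54, 325), 1: (123, 456)} -- note the namespace keys are
# converted from JSON strings to ints.
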
def get_resume_point(output_file, partition_namespaces=False):
    """
    Find the resume point(s) from existing parquet output.

    First checks for a checkpoint file (fast), then falls back to scanning
    the parquet output (slow, for backwards compatibility).

    Args:
        output_file: Path to the output file. For single files, this is the parquet file path.
            For partitioned namespaces, this is a path like dir/dump.parquet where the
            namespace=* subdirectories live inside dir.
        partition_namespaces: Whether the output uses namespace partitioning.

    Returns:
        For single files: A tuple (pageid, revid) for the row with the highest pageid,
            or None if not found.
        For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
            or None if no partitions exist.
    """
    # First try the checkpoint file (fast)
    checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
    checkpoint_result = read_checkpoint(output_file, partition_namespaces)
    if checkpoint_result is not None:
        print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
        return checkpoint_result

    # Fall back to scanning the parquet output (slow, for backwards compatibility)
    print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
    try:
        if partition_namespaces:
            return _get_resume_point_partitioned(output_file)
        else:
            return _get_resume_point_single_file(output_file)
    except Exception as e:
        print(f"Error reading resume point from {output_file}: {e}", file=sys.stderr)
        return None

def _get_last_row_resume_point(pq_path):
    """Get the resume point by reading only the last row group of a parquet file.

    Since data is written in page/revision order, the last row group contains
    the highest pageid/revid, and the last row in that group is the resume point.
    """
    pf = pq.ParquetFile(pq_path)
    if pf.metadata.num_row_groups == 0:
        return None

    last_rg_idx = pf.metadata.num_row_groups - 1
    table = pf.read_row_group(last_rg_idx, columns=['articleid', 'revid'])
    if table.num_rows == 0:
        return None

    max_pageid = table['articleid'][-1].as_py()
    max_revid = table['revid'][-1].as_py()
    return (max_pageid, max_revid)

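
# A minimal illustration of the assumption above (hypothetical values): if the last
# row group of namespace=0/dump.parquet ends with the row (articleid=54, revid=325),
# _get_last_row_resume_point() returns (54, 325) and writing resumes after that
# revision. The ordering guarantee comes from the dump being written in
# page/revision order, as noted in the docstring.
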
def _get_resume_point_partitioned(output_file):
    """Find per-namespace resume points from partitioned output.

    Only looks for the specific output file in each namespace directory.
    Returns a dict mapping namespace -> (max_pageid, max_revid) for each partition
    where the output file exists.

    Args:
        output_file: Path like 'dir/output.parquet' where namespace=* subdirectories
            contain files named 'output.parquet'.
    """
    partition_dir = os.path.dirname(output_file)
    output_filename = os.path.basename(output_file)

    if not os.path.exists(partition_dir) or not os.path.isdir(partition_dir):
        return None

    namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
    if not namespace_dirs:
        return None

    resume_points = {}
    for ns_dir in namespace_dirs:
        ns = int(ns_dir.split('=')[1])
        pq_path = os.path.join(partition_dir, ns_dir, output_filename)

        if not os.path.exists(pq_path):
            continue

        try:
            result = _get_last_row_resume_point(pq_path)
            if result is not None:
                resume_points[ns] = result
        except Exception as e:
            print(f"Warning: Could not read {pq_path}: {e}", file=sys.stderr)
            continue

    return resume_points if resume_points else None

def _get_resume_point_single_file(output_file):
    """Find the resume point from a single parquet file."""
    if not os.path.exists(output_file):
        return None

    if os.path.isdir(output_file):
        return None

    return _get_last_row_resume_point(output_file)

def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by streaming row groups from original and temp into merged.

    This is memory-efficient: only one row group is loaded at a time.

    Returns:
        "merged" - merged file was created from both sources
        "original_only" - temp was invalid, keep original unchanged
        "temp_only" - original was corrupted but temp is valid, use temp
        "both_invalid" - both files invalid, delete both and start fresh
        False - both files were valid but empty
    """
    original_valid = False
    temp_valid = False
    original_pq = None
    temp_pq = None

    try:
        original_pq = pq.ParquetFile(original_path)
        original_valid = True
    except Exception as e:
        print(f"Warning: Original file {original_path} is corrupted or invalid: {e}", file=sys.stderr)

    try:
        temp_pq = pq.ParquetFile(temp_path)
        temp_valid = True
    except Exception:
        print(f"Note: No new data in temp file {temp_path} (namespace had no records after resume point)", file=sys.stderr)

    if not original_valid and not temp_valid:
        print("Both original and temp files are invalid, will start fresh", file=sys.stderr)
        return "both_invalid"

    if not original_valid and temp_valid:
        print("Original file corrupted but temp file is valid, recovering from temp", file=sys.stderr)
        return "temp_only"

    if original_valid and not temp_valid:
        return "original_only"

    merged_writer = None

    # Copy all row groups from the original file
    for i in range(original_pq.num_row_groups):
        row_group = original_pq.read_row_group(i)
        if merged_writer is None:
            merged_writer = pq.ParquetWriter(
                merged_path,
                row_group.schema,
                flavor="spark"
            )
        merged_writer.write_table(row_group)

    # Append all row groups from the temp file
    for i in range(temp_pq.num_row_groups):
        row_group = temp_pq.read_row_group(i)
        if merged_writer is None:
            merged_writer = pq.ParquetWriter(
                merged_path,
                row_group.schema,
                flavor="spark"
            )
        merged_writer.write_table(row_group)

    # Close the writer
    if merged_writer is not None:
        merged_writer.close()
        return "merged"
    return False

def merge_partitioned_namespaces(partition_dir, temp_suffix):
    """
    Merge partitioned namespace directories after resume.

    For partitioned namespaces, temp files are written alongside the original files
    in each namespace directory with the temp suffix appended to the filename.
    E.g., original: namespace=0/file.parquet, temp: namespace=0/file.parquet.resume_temp

    Args:
        partition_dir: The partition directory containing namespace=* subdirs
        temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')

    Returns:
        True if corruption forced deletion of a file with no recoverable replacement
        (the caller may need to fall back to a fresh start), False otherwise.
    """
    namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
    had_corruption = False

    for ns_dir in namespace_dirs:
        ns_path = os.path.join(partition_dir, ns_dir)

        # Find all files in this namespace directory
        files = os.listdir(ns_path)

        # Find temp files (files ending with the temp suffix)
        temp_files = [f for f in files if f.endswith(temp_suffix)]

        for temp_file in temp_files:
            temp_path = os.path.join(ns_path, temp_file)
            # The original file is the temp file name without the suffix
            original_file = temp_file[:-len(temp_suffix)]
            original_path = os.path.join(ns_path, original_file)

            if os.path.exists(original_path):
                # Merge the files
                merged_path = original_path + ".merged"
                merged = merge_parquet_files(original_path, temp_path, merged_path)

                if merged == "original_only":
                    # Temp file was invalid (no new data), keep original unchanged
                    os.remove(temp_path)
                elif merged == "temp_only":
                    # Original was corrupted, use temp as the new base
                    os.remove(original_path)
                    os.rename(temp_path, original_path)
                elif merged == "both_invalid":
                    # Both files corrupted, remove both
                    os.remove(original_path)
                    os.remove(temp_path)
                    had_corruption = True
                elif merged == "merged":
                    # Replace the original file with the merged file
                    os.remove(original_path)
                    os.rename(merged_path, original_path)
                    os.remove(temp_path)
                else:
                    # Both files were empty (False), just remove them
                    os.remove(original_path)
                    os.remove(temp_path)
            else:
                # No original file, rename temp to original only if it is valid
                try:
                    pq.ParquetFile(temp_path)
                    os.rename(temp_path, original_path)
                except Exception:
                    # Temp file invalid, just remove it
                    os.remove(temp_path)
                    had_corruption = True

    return had_corruption

def finalize_resume_merge(
    original_output_file,
    temp_output_file,
    partition_namespaces,
    original_partition_dir
):
    """
    Finalize the resume by merging temp output with the original output.

    Args:
        original_output_file: Path to the original output file
        temp_output_file: Path to the temp output file written during resume
        partition_namespaces: Whether using partitioned namespace output
        original_partition_dir: The partition directory (for partitioned output)

    Raises:
        Exception: If the merge fails (the temp file is preserved for recovery)
    """
    import shutil

    print("Merging resumed data with existing output...", file=sys.stderr)
    try:
        if partition_namespaces and original_partition_dir is not None:
            # For partitioned namespaces, temp files are written alongside originals
            # with a '.resume_temp' suffix in each namespace directory.
            merge_partitioned_namespaces(original_partition_dir, ".resume_temp")
            # Clean up the empty temp directory we created
            if os.path.exists(temp_output_file) and os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
        else:
            # Merge single parquet files
            merged_output_file = original_output_file + ".merged"
            merged = merge_parquet_files(original_output_file, temp_output_file, merged_output_file)

            if merged == "original_only":
                # Temp file was invalid (no new data), keep original unchanged
                if os.path.exists(temp_output_file):
                    os.remove(temp_output_file)
            elif merged == "temp_only":
                # Original was corrupted, use temp as the new base
                os.remove(original_output_file)
                os.rename(temp_output_file, original_output_file)
            elif merged == "both_invalid":
                # Both files corrupted, remove both
                os.remove(original_output_file)
                if os.path.exists(temp_output_file):
                    os.remove(temp_output_file)
            elif merged == "merged":
                # Replace the original file with the merged file
                os.remove(original_output_file)
                os.rename(merged_output_file, original_output_file)
                if os.path.exists(temp_output_file):
                    os.remove(temp_output_file)
            else:
                # Both files were empty (False) - unusual, but clean up
                os.remove(original_output_file)
                if os.path.exists(temp_output_file):
                    os.remove(temp_output_file)

        print("Merge complete.", file=sys.stderr)
    except Exception as e:
        print(f"Error merging resume data for {original_output_file}: {e}", file=sys.stderr)
        print(f"New data saved in: {temp_output_file}", file=sys.stderr)
        raise

def setup_resume_temp_output(output_file, partition_namespaces):
    """
    Set up temp output for resume mode.

    Args:
        output_file: The original output file path
        partition_namespaces: Whether using partitioned namespace output

    Returns:
        Tuple of (original_output_file, temp_output_file, original_partition_dir),
        or (None, None, None) if there is no existing output to resume from.
    """
    import shutil

    original_output_file = None
    temp_output_file = None
    original_partition_dir = None

    # For partitioned namespaces, check if the specific output file exists in any namespace
    if partition_namespaces:
        partition_dir = os.path.dirname(output_file)
        output_filename = os.path.basename(output_file)
        output_exists = False
        if os.path.isdir(partition_dir):
            for d in os.listdir(partition_dir):
                if d.startswith('namespace='):
                    if os.path.exists(os.path.join(partition_dir, d, output_filename)):
                        output_exists = True
                        break
        if output_exists:
            original_partition_dir = partition_dir
    else:
        output_exists = isinstance(output_file, str) and os.path.exists(output_file)

    if output_exists:
        original_output_file = output_file
        temp_output_file = output_file + ".resume_temp"

        # Note: cleanup_interrupted_resume() should have been called before this
        # to merge any leftover temp files from a previous interrupted run.
        # Here we just clean up any remaining temp directory markers.
        if os.path.exists(temp_output_file):
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
            else:
                os.remove(temp_output_file)

        if partition_namespaces:
            os.makedirs(temp_output_file, exist_ok=True)

    return original_output_file, temp_output_file, original_partition_dir