#!/usr/bin/env python3 # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size import argparse import gc import json import os.path import re import signal import sys import threading import time from collections import deque, defaultdict from hashlib import sha1 from io import TextIOWrapper from itertools import groupby from subprocess import PIPE, Popen from typing import IO, Any, Generator, TextIO, Union import mwpersistence import mwreverts import mwxml import pywikidiff2 from deltas.tokenizers import wikitext_split from more_itertools import peekable from mwxml import Dump import wikiq.tables as tables from wikiq.tables import RevisionTable from wikiq.wiki_diff_matcher import WikiDiffMatcher from wikiq.wikitext_parser import WikitextParser from wikiq.resume import ( get_checkpoint_path, read_checkpoint, get_resume_point, setup_resume_temp_output, finalize_resume_merge, cleanup_interrupted_resume, ) TO_ENCODE = ("title", "editor") PERSISTENCE_RADIUS = 7 DIFF_TIMEOUT_MS = 60000 from pathlib import Path import pyarrow as pa import pyarrow.csv as pacsv import pyarrow.parquet as pq from deltas import SegmentMatcher, SequenceMatcher def pyarrow_type_to_spark(pa_type): """Convert a PyArrow type to Spark JSON schema format.""" if pa.types.is_int64(pa_type): return "long" elif pa.types.is_int32(pa_type): return "integer" elif pa.types.is_int8(pa_type): return "byte" elif pa.types.is_boolean(pa_type): return "boolean" elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): return "string" elif pa.types.is_timestamp(pa_type): return "timestamp" elif pa.types.is_list(pa_type): return { "type": "array", "elementType": pyarrow_type_to_spark(pa_type.value_type), "containsNull": True } elif pa.types.is_struct(pa_type): return { "type": "struct", "fields": [ { "name": field.name, "type": pyarrow_type_to_spark(field.type), "nullable": field.nullable, 
"metadata": {} } for field in pa_type ] } elif pa.types.is_map(pa_type): return { "type": "map", "keyType": pyarrow_type_to_spark(pa_type.key_type), "valueType": pyarrow_type_to_spark(pa_type.item_type), "valueContainsNull": True } else: return "string" def pyarrow_to_spark_schema(schema: pa.Schema) -> dict: """Convert a PyArrow schema to Spark JSON schema format.""" return { "type": "struct", "fields": [ { "name": field.name, "type": pyarrow_type_to_spark(field.type), "nullable": field.nullable, "metadata": {} } for field in schema ] } def build_table( text: bool = False, collapse_user: bool = False, external_links: bool = False, citations: bool = False, wikilinks: bool = False, templates: bool = False, headings: bool = False, ): """Build the RevisionTable with appropriate columns based on flags. Returns: (table, reverts_column) - the table and a reference to the reverts column (which process() needs for setting the revert detector). """ reverts_column = tables.RevisionReverts() table = RevisionTable([ tables.RevisionId(), tables.RevisionTimestamp(), tables.RevisionArticleId(), tables.RevisionPageTitle(), tables.RevisionNamespace(), tables.RevisionDeleted(), tables.RevisionEditorId(), tables.RevisionEditSummary(), tables.RevisionTextChars(), reverts_column, tables.RevisionSha1(), tables.RevisionIsMinor(), tables.RevisionEditorText(), tables.RevisionIsAnon(), ]) if text: table.columns.append(tables.RevisionText()) if collapse_user: table.columns.append(tables.RevisionCollapsed()) if external_links or citations or wikilinks or templates or headings: wikitext_parser = WikitextParser() if external_links: table.columns.append(tables.RevisionExternalLinks(wikitext_parser)) if citations: table.columns.append(tables.RevisionCitations(wikitext_parser)) if wikilinks: table.columns.append(tables.RevisionWikilinks(wikitext_parser)) if templates: table.columns.append(tables.RevisionTemplates(wikitext_parser)) if headings: 
table.columns.append(tables.RevisionHeadings(wikitext_parser)) table.columns.append(tables.RevisionParserTimeout(wikitext_parser)) return table, reverts_column def build_schema( table, diff: bool = False, persist: int = 0, text: bool = False, regex_revision_pairs: list = None, regex_comment_pairs: list = None, ) -> pa.Schema: """Build the PyArrow schema from a table, adding output-only fields.""" schema = table.schema() schema = schema.append(pa.field("revert", pa.bool_(), nullable=True)) if diff: from wikiq.diff_pyarrow_schema import diff_field schema = schema.append(diff_field) schema = schema.append(pa.field("diff_timeout", pa.bool_())) if regex_revision_pairs: for pair in regex_revision_pairs: for field in pair.get_pyarrow_fields(): schema = schema.append(field) if regex_comment_pairs: for pair in regex_comment_pairs: for field in pair.get_pyarrow_fields(): schema = schema.append(field) if persist != PersistMethod.none: # RevisionText is added to the table for extraction, but not to schema # (unless text=True, in which case it's already in the schema from build_table) schema = schema.append(pa.field("token_revs", pa.int64(), nullable=True)) schema = schema.append(pa.field("tokens_added", pa.int64(), nullable=True)) schema = schema.append(pa.field("tokens_removed", pa.int64(), nullable=True)) schema = schema.append(pa.field("tokens_window", pa.int64(), nullable=True)) return schema def make_regex_pairs(patterns, labels) -> list: """Create RegexPair objects from patterns and labels.""" if (patterns is not None and labels is not None) and (len(patterns) == len(labels)): return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)] elif patterns is None and labels is None: return [] else: sys.exit("Each regular expression *must* come with a corresponding label and vice versa.") class JSONLWriter: """Write JSONL output with schema validation.""" def __init__(self, output_file: str, schema: pa.Schema, append: bool = False): self.output_file = 
output_file self.schema = schema self.field_names = [field.name for field in schema] if append and os.path.exists(output_file): self._validate_and_fix_last_line(output_file) mode = "a" if append else "w" self._file = open(output_file, mode) def _validate_and_fix_last_line(self, filepath: str): """Validate the last line of JSONL file; truncate if corrupted. If the previous run was interrupted mid-write, the last line may be incomplete JSON. This detects and removes such corrupted lines. """ with open(filepath, 'rb') as f: f.seek(0, 2) file_size = f.tell() if file_size == 0: return # Read backwards to find the last newline chunk_size = min(8192, file_size) f.seek(-chunk_size, 2) chunk = f.read(chunk_size) # Find the last complete line last_newline = chunk.rfind(b'\n') if last_newline == -1: # Entire file is one line (possibly corrupted) last_line = chunk.decode('utf-8', errors='replace') truncate_pos = 0 else: last_line = chunk[last_newline + 1:].decode('utf-8', errors='replace') truncate_pos = file_size - chunk_size + last_newline + 1 # If last line is empty, file ends with newline - that's fine if not last_line.strip(): return # Try to parse the last line as JSON try: json.loads(last_line) except json.JSONDecodeError: print(f"Warning: Last line of {filepath} is corrupted JSON, removing it", file=sys.stderr) # Truncate the file to remove the corrupted last line with open(filepath, 'r+b') as f: f.truncate(truncate_pos) def write_batch(self, data: dict): """Write a batch of rows as JSONL. 
Args: data: dict mapping column names to lists of values """ if not data or not data.get(self.field_names[0]): return num_rows = len(data[self.field_names[0]]) for i in range(num_rows): row = {} for name in self.field_names: if name in data: value = data[name][i] row[name] = self._convert_value(value) self._file.write(json.dumps(row) + "\n") def _convert_value(self, value): """Convert a value to JSON-serializable format.""" if value is None: return None elif isinstance(value, (str, int, float, bool)): return value elif hasattr(value, "isoformat"): return value.isoformat() elif isinstance(value, (list, tuple)): return [self._convert_value(v) for v in value] elif isinstance(value, dict): return {k: self._convert_value(v) for k, v in value.items()} else: return str(value) def close(self): self._file.close() def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() return False class PersistMethod: none = 0 sequence = 1 segment = 2 legacy = 3 wikidiff2 = 4 def diff_with_timeout(differ, last_text, text): """Returns (result, timed_out) tuple using native pywikidiff2 timeout.""" result = differ.inline_json_diff(last_text, text, timeout_ms=DIFF_TIMEOUT_MS) return result, differ.timed_out() def calculate_persistence(tokens_added): return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added)) def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]: i = 0 for rev in revs: if rev.text is None: rev.text = "" if not rev.sha1 and not rev.deleted.text: rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() revs[i] = rev i += 1 return revs class WikiqIterator: def __init__(self, fh, collapse_user=False): self.fh = fh self.collapse_user = collapse_user self.mwiterator = Dump.from_file(self.fh) self.namespace_map = { ns.id: ns.name for ns in self.mwiterator.site_info.namespaces } self.__pages: Generator[WikiqPage] = self.load_pages() def load_pages(self): for page in self.mwiterator: yield WikiqPage( page, 
namespace_map=self.namespace_map, collapse_user=self.collapse_user ) def __iter__(self): return self.__pages def __next__(self): return next(self.__pages) class WikiqPage: __slots__ = ( "id", "redirect", "restrictions", "mwpage", "__revisions", "collapse_user", ) def __init__(self, page, namespace_map, collapse_user=False): self.id = page.id # following mwxml, we assume namespace 0 in cases where # page.namespace is inconsistent with namespace_map if page.namespace not in namespace_map: page.namespace = 0 if page.namespace != 0: page.title = ":".join([namespace_map[page.namespace], page.title]) self.restrictions = page.restrictions self.collapse_user = collapse_user self.mwpage = page self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list() @staticmethod def user_text(rev) -> Union[str, None]: return None if rev.deleted.user else rev.user.text def rev_list(self): # Outline for how we want to handle collapse_user=True # iteration rev.user prev_rev.user add prev_rev? # 0 A None Never # 1 A A False # 2 B A True # 3 A B True # 4 A A False # Post-loop A Always if not self.collapse_user: for rev in self.mwpage: yield [rev] return for _, revs in groupby(self.mwpage, self.user_text): # All revisions are either from the same user, or this is a single # revision where the user is missing. yield list(revs) def __iter__(self): return self.__revisions def __next__(self): return next(self.__revisions) """ A RegexPair is defined by a regular expression (pattern) and a label. The pattern can include capture groups. If it does then each capture group will have a resulting column in the output. If the pattern does not include a capture group, then only one output column will result. 
""" class RegexPair(object): def __init__(self, pattern, label): self.pattern = re.compile(pattern) self.label = label self.has_groups = bool(self.pattern.groupindex) if self.has_groups: self.capture_groups = list(self.pattern.groupindex.keys()) def get_pyarrow_fields(self): if self.has_groups: fields = [ pa.field(self._make_key(cap_group), pa.string()) for cap_group in self.capture_groups ] else: fields = [pa.field(self.label, pa.string())] return fields def _make_key(self, cap_group): return "{}_{}".format(self.label, cap_group) def matchmake(self, content: str) -> dict: temp_dict = {} # if there are named capture groups in the regex if self.has_groups: # if there are matches of some sort in this revision content, fill the lists for each cap_group if self.pattern.search(content) is not None: m = self.pattern.finditer(content) matchobjects = list(m) for cap_group in self.capture_groups: key = self._make_key(cap_group) temp_list = [] for match in matchobjects: # we only want to add the match for the capture group if the match is not None if match.group(cap_group) is not None: temp_list.append(match.group(cap_group)) # if temp_list of matches is empty just make that column None if len(temp_list) == 0: temp_dict[key] = None # else we put in the list we made in the for-loop above else: temp_dict[key] = ", ".join(temp_list) # there are no matches at all in this revision content, we default values to None else: for cap_group in self.capture_groups: key = self._make_key(cap_group) temp_dict[key] = None # there are no capture groups, we just search for all the matches of the regex else: # given that there are matches to be made if type(content) in (str, bytes): if self.pattern.search(content) is not None: m = self.pattern.findall(content) temp_dict[self.label] = ", ".join(m) else: temp_dict[self.label] = None return temp_dict class WikiqParser: def __init__( self, input_file: Union[TextIOWrapper, IO[Any], IO[bytes]], output_file: Union[TextIO, str], regex_match_revision: 
list[str], regex_match_comment: list[str], regex_revision_label: list[str], regex_comment_label: list[str], text: bool = False, diff: bool = False, collapse_user: bool = False, persist: int = None, namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_jsonl: bool = False, output_jsonl_dir: bool = False, output_parquet: bool = False, batch_size: int = 1024, resume_point: Union[tuple, dict, None] = None, partition_namespaces: bool = False, external_links: bool = False, citations: bool = False, wikilinks: bool = False, templates: bool = False, headings: bool = False, time_limit_seconds: Union[float, None] = None, max_revisions_per_file: int = 0, ): """ Parameters: persist : what persistence method to use. Takes a PersistMethod value resume_point : if set, either a (pageid, revid) tuple for single-file output, or a dict mapping namespace -> (pageid, revid) for partitioned output. For single-file: skip all revisions up to and including this point. max_revisions_per_file : if > 0, close and rotate output files after this many revisions """ self.input_file = input_file self.collapse_user: bool = collapse_user self.persist: int = persist self.namespaces = [] self.revert_radius = revert_radius self.diff = diff self.text = text self.partition_namespaces = partition_namespaces self.resume_point = resume_point self.external_links = external_links self.citations = citations self.wikilinks = wikilinks self.templates = templates self.headings = headings self.shutdown_requested = False self.time_limit_seconds = time_limit_seconds self.max_revisions_per_file = max_revisions_per_file if namespaces is not None: self.namespace_filter = set(namespaces) else: self.namespace_filter = None self.regex_schemas = [] self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs( regex_match_revision, regex_revision_label ) self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs( regex_match_comment, regex_comment_label ) # Initialize output 
self.batch_size = batch_size self.output_jsonl = output_jsonl self.output_jsonl_dir = output_jsonl_dir self.output_parquet = output_parquet self.output_file = output_file if output_parquet: self.pq_writer = None self.parquet_buffer = [] elif output_jsonl: pass # JSONLWriter created in process() else: # TSV output self.print_header = True if output_file == sys.stdout.buffer: pass else: self.output_file = open(output_file, "wb") # Checkpoint for tracking resume point (path only, no open file handle for NFS safety) self.checkpoint_path = None self.checkpoint_state = {} # namespace -> (pageid, revid) or None -> (pageid, revid) def request_shutdown(self): """Request graceful shutdown. The process() method will exit after completing the current batch.""" self.shutdown_requested = True def _time_limit_expired(self): """Timer callback when time limit is reached.""" hours = self.time_limit_seconds / 3600 print(f"Time limit of {hours:.2f} hours reached, requesting shutdown...", file=sys.stderr) self.request_shutdown() def _start_time_limit_timer(self): """Start a background timer to trigger shutdown when time limit is reached.""" if self.time_limit_seconds is None: return None timer = threading.Timer(self.time_limit_seconds, self._time_limit_expired) timer.daemon = True timer.start() return timer def _cancel_time_limit_timer(self, timer): """Cancel the time limit timer if it's still running.""" if timer is not None: timer.cancel() def _get_part_path(self, base_path, part_num): """Generate path with part number inserted before extension. Example: output.parquet -> output.part0.parquet """ path = Path(base_path) return path.parent / f"{path.stem}.part{part_num}{path.suffix}" def _open_checkpoint(self, output_file): """Enable checkpointing for Parquet output only. JSONL doesn't need checkpoint files - resume point is derived from last line. 
""" if not self.output_parquet or output_file == sys.stdout.buffer: return self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces) Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True) print(f"Checkpoint enabled: {self.checkpoint_path}", file=sys.stderr) def _update_checkpoint(self, pageid, revid, namespace=None, part=0): """Update checkpoint state and write atomically (NFS-safe).""" if self.checkpoint_path is None: return if self.partition_namespaces: self.checkpoint_state[namespace] = {"pageid": pageid, "revid": revid, "part": part} else: self.checkpoint_state = {"pageid": pageid, "revid": revid, "part": part} # Atomic write: write to temp file, then rename temp_path = self.checkpoint_path + ".tmp" with open(temp_path, 'w') as f: json.dump(self.checkpoint_state, f) os.replace(temp_path, self.checkpoint_path) def _close_checkpoint(self, delete=False): """Clean up checkpoint, optionally deleting it.""" if self.checkpoint_path is None: return if delete and os.path.exists(self.checkpoint_path): os.remove(self.checkpoint_path) print(f"Checkpoint deleted: {self.checkpoint_path}", file=sys.stderr) elif os.path.exists(self.checkpoint_path): print(f"Checkpoint preserved for resume: {self.checkpoint_path}", file=sys.stderr) # Clean up any leftover temp file temp_path = self.checkpoint_path + ".tmp" if os.path.exists(temp_path): os.remove(temp_path) def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_base_paths, sorting_cols, namespace=None, part_numbers=None): """Write a batch of rows to the appropriate writer. For partitioned output, creates writer lazily if needed. Returns (writer, num_rows) - writer used and number of rows written. 
""" num_rows = len(row_buffer.get("revid", [])) if self.partition_namespaces and namespace is not None: if namespace not in pq_writers: base_path = ns_base_paths[namespace] part_num = part_numbers.get(namespace, 0) if part_numbers else 0 if self.max_revisions_per_file > 0: ns_path = self._get_part_path(base_path, part_num) else: ns_path = base_path Path(ns_path).parent.mkdir(exist_ok=True, parents=True) pq_writers[namespace] = pq.ParquetWriter( ns_path, schema, flavor="spark", sorting_columns=sorting_cols ) writer = pq_writers[namespace] writer.write(pa.record_batch(row_buffer, schema=schema)) return writer, num_rows def make_matchmake_pairs(self, patterns, labels) -> list[RegexPair]: if (patterns is not None and labels is not None) and ( len(patterns) == len(labels) ): result: list[RegexPair] = [] for pattern, label in zip(patterns, labels): rp = RegexPair(pattern, label) result.append(rp) self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields() return result elif (patterns is None) and (labels is None): return [] else: sys.exit( "Each regular expression *must* come with a corresponding label and vice versa." 
) def matchmake_revision(self, rev: mwxml.Revision): result = self.matchmake_text(rev.text) for k, v in self.matchmake_comment(rev.comment).items(): result[k] = v return result def matchmake_text(self, text: str): return self.matchmake_pairs(text, self.regex_revision_pairs) def matchmake_comment(self, comment: str): return self.matchmake_pairs(comment, self.regex_comment_pairs) @staticmethod def matchmake_pairs(text, pairs): result = {} for pair in pairs: for k, v in pair.matchmake(text).items(): result[k] = v return result def __get_namespace_from_title(self, title): default_ns = None for ns in self.namespaces: # skip if the namespace is not defined if ns is None: default_ns = self.namespaces[ns] continue if title.startswith(ns + ":"): return self.namespaces[ns] # if we've made it this far with no matches, we return the default namespace return default_ns def process(self): # Start time limit timer if configured time_limit_timer = self._start_time_limit_timer() # Track whether we've passed the resume point if self.resume_point is None: found_resume_point = True elif self.partition_namespaces: found_resume_point = {} else: found_resume_point = False # When resuming with parquet, write new data to temp file/directory and merge at the end original_output_file = None temp_output_file = None original_partition_dir = None if self.resume_point is not None and self.output_parquet: original_output_file, temp_output_file, original_partition_dir = \ setup_resume_temp_output(self.output_file, self.partition_namespaces) if temp_output_file is not None: self.output_file = temp_output_file # Open checkpoint file for tracking resume point checkpoint_output = original_output_file if original_output_file else self.output_file self._open_checkpoint(checkpoint_output) # Construct dump file iterator dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) table, reverts_column = build_table( text=self.text, collapse_user=self.collapse_user, 
external_links=self.external_links, citations=self.citations, wikilinks=self.wikilinks, templates=self.templates, headings=self.headings, ) # Extract list of namespaces self.namespaces = { ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces } page_count = 0 rev_count = 0 schema = build_schema( table, diff=self.diff, persist=self.persist, text=self.text, regex_revision_pairs=self.regex_revision_pairs, regex_comment_pairs=self.regex_comment_pairs, ) # Add RevisionText to table for diff/persist computation (extraction only, not output) if (self.diff or self.persist != PersistMethod.none) and not self.text: table.columns.append(tables.RevisionText()) # Initialize writer writer = None sorting_cols = None ns_base_paths = {} pq_writers = {} part_numbers = {} if self.output_parquet: pageid_sortingcol = pq.SortingColumn(schema.get_field_index("articleid")) revid_sortingcol = pq.SortingColumn(schema.get_field_index("revid")) sorting_cols = [pageid_sortingcol, revid_sortingcol] if self.resume_point is not None: if self.partition_namespaces: for ns, resume_data in self.resume_point.items(): part_numbers[ns] = resume_data[2] if len(resume_data) > 2 else 0 else: part_numbers[None] = self.resume_point[2] if len(self.resume_point) > 2 else 0 if not self.partition_namespaces: if self.max_revisions_per_file > 0: output_path_with_part = self._get_part_path(self.output_file, part_numbers.get(None, 0)) else: output_path_with_part = self.output_file writer = pq.ParquetWriter( output_path_with_part, schema, flavor="spark", sorting_columns=sorting_cols, ) else: output_path = Path(self.output_file) if self.namespace_filter is not None: namespaces = self.namespace_filter else: namespaces = self.namespaces.values() ns_base_paths = { ns: (output_path.parent / f"namespace={ns}") / output_path.name for ns in namespaces } for ns in namespaces: if ns not in part_numbers: part_numbers[ns] = 0 elif self.output_jsonl: append_mode = self.resume_point is not None if self.output_jsonl_dir: # 
Create directory for JSONL output Path(self.output_file).mkdir(parents=True, exist_ok=True) part_num = 0 if self.resume_point is not None and len(self.resume_point) > 2: part_num = self.resume_point[2] part_numbers[None] = part_num jsonl_path = self._get_part_path( Path(self.output_file) / "data.jsonl", part_num ) writer = JSONLWriter(str(jsonl_path), schema, append=append_mode) else: writer = JSONLWriter(self.output_file, schema, append=append_mode) else: writer = pacsv.CSVWriter( self.output_file, schema, write_options=pacsv.WriteOptions(delimiter="\t"), ) # Initialize diff machinery differ = None fast_differ = None if self.diff: differ = pywikidiff2.pywikidiff2( num_context_lines=1000000, max_word_level_diff_complexity=-1, moved_paragraph_detection_cutoff=-1, words_cache_capacity=10000, diff_cache_capacity=10000, stats_cache_capacity=10000, ) fast_differ = pywikidiff2.pywikidiff2( num_context_lines=1000000, max_word_level_diff_complexity=40000000, moved_paragraph_detection_cutoff=100, words_cache_capacity=-1, diff_cache_capacity=-1, stats_cache_capacity=-1, ) # Write buffer: accumulate rows before flushing write_buffer = defaultdict(list) buffer_count = 0 last_namespace = None def flush_buffer(): nonlocal write_buffer, buffer_count, last_namespace if buffer_count == 0: return row_buffer = dict(write_buffer) namespace = last_namespace if self.output_parquet: if self.partition_namespaces: self._write_batch( row_buffer, schema, writer, pq_writers, ns_base_paths, sorting_cols, namespace=namespace, part_numbers=part_numbers ) else: writer.write(pa.record_batch(row_buffer, schema=schema)) elif self.output_jsonl: writer.write_batch(row_buffer) else: writer.write(pa.record_batch(row_buffer, schema=schema)) # Update checkpoint last_pageid = row_buffer["articleid"][-1] last_revid = row_buffer["revid"][-1] part = part_numbers.get(namespace if self.partition_namespaces else None, 0) self._update_checkpoint(last_pageid, last_revid, namespace=namespace if 
self.partition_namespaces else None, part=part) write_buffer = defaultdict(list) buffer_count = 0 # Iterate through pages for page in dump: # Skip namespaces not in the filter if self.namespace_filter is not None: if page.mwpage.namespace not in self.namespace_filter: continue # Resume logic: skip pages before the resume point is_resume_page = False page_resume_revid = None if self.resume_point is not None and not found_resume_point: page_id = page.mwpage.id resume_pageid, resume_revid = self.resume_point[0], self.resume_point[1] if page_id < resume_pageid: continue elif page_id == resume_pageid: is_resume_page = True page_resume_revid = resume_revid else: found_resume_point = True # Reset revert detector for new page if self.revert_radius > 0: reverts_column.rev_detector = mwreverts.Detector(radius=self.revert_radius) else: reverts_column.rev_detector = None # State for this page prev_text = "" persist_state = None persist_window = None if self.persist != PersistMethod.none: persist_window = deque(maxlen=PERSISTENCE_RADIUS) if self.persist == PersistMethod.sequence: persist_state = mwpersistence.DiffState( SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS, ) elif self.persist == PersistMethod.segment: persist_state = mwpersistence.DiffState( SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS, ) elif self.persist == PersistMethod.wikidiff2: wikidiff_matcher = WikiDiffMatcher(tokenizer=wikitext_split) persist_state = mwpersistence.DiffState( wikidiff_matcher, revert_radius=PERSISTENCE_RADIUS ) else: from mw.lib import persistence persist_state = persistence.State() # Pending persistence values waiting for window to fill pending_persistence = [] # Use peekable to detect last revision in page revs_iter = peekable(page) for revs in revs_iter: # revs is either a single revision or a group (collapse_user mode) revs = list(revs) revs = fix_hex_digests(revs) rev = revs[-1] # Last revision in the group is_last_in_page = 
revs_iter.peek(None) is None # Skip revisions before resume point if is_resume_page: if rev.id <= page_resume_revid: # Update state for correctness when we resume output if self.diff or self.persist != PersistMethod.none: prev_text = rev.text or "" if persist_state is not None: text = rev.text or "" if self.persist != PersistMethod.legacy: persist_state.update(text, rev.id) else: persist_state.process(text, rev.id) # Update revert detector so it has history for post-resume revisions if reverts_column.rev_detector is not None and not rev.deleted.text: reverts_column.rev_detector.process(rev.sha1, rev.id) if rev.id == page_resume_revid: found_resume_point = True is_resume_page = False print(f"Resuming output after revid {rev.id}", file=sys.stderr) continue rev_count += 1 # Extract base row data row = table.extract_row(page.mwpage, revs) # Compute revert flag if self.revert_radius == 0 or row["deleted"]: row["revert"] = None else: row["revert"] = row["reverteds"] is not None # Regex matching regex_dict = self.matchmake_revision(rev) for k, v in regex_dict.items(): row[k] = v # Compute diff text = row.get("text", "") or "" if self.diff: diff_result, timed_out = diff_with_timeout(differ, prev_text, text) if timed_out: print(f"WARNING! wikidiff2 timeout for rev: {rev.id}. 
Falling back to default limits.", file=sys.stderr) diff_result = fast_differ.inline_json_diff(prev_text, text) row["diff"] = [entry for entry in json.loads(diff_result)["diff"] if entry["type"] != 0] row["diff_timeout"] = timed_out # Compute persistence if persist_state is not None: if self.persist != PersistMethod.legacy: _, tokens_added, tokens_removed = persist_state.update(text, rev.id) else: _, tokens_added, tokens_removed = persist_state.process(text, rev.id) persist_window.append((rev.id, tokens_added, tokens_removed)) pending_persistence.append(row) # When window is full, emit persistence for oldest revision if len(persist_window) == PERSISTENCE_RADIUS: old_rev_id, old_tokens_added, old_tokens_removed = persist_window.popleft() oldest_row = pending_persistence.pop(0) num_token_revs, num_tokens = calculate_persistence(old_tokens_added) oldest_row["token_revs"] = num_token_revs oldest_row["tokens_added"] = num_tokens oldest_row["tokens_removed"] = len(old_tokens_removed) oldest_row["tokens_window"] = PERSISTENCE_RADIUS - 1 # Remove text if not outputting it if not self.text and "text" in oldest_row: del oldest_row["text"] # Add to write buffer for k, v in oldest_row.items(): write_buffer[k].append(v) buffer_count += 1 last_namespace = page.mwpage.namespace if buffer_count >= self.batch_size: flush_buffer() # Update prev_text for next iteration if self.diff or self.persist != PersistMethod.none: prev_text = text # If no persistence, write row directly if persist_state is None: if not self.text and "text" in row: del row["text"] for k, v in row.items(): write_buffer[k].append(v) buffer_count += 1 last_namespace = page.mwpage.namespace if buffer_count >= self.batch_size: flush_buffer() # Check for shutdown if self.shutdown_requested: print("Shutdown requested, closing writers...", file=sys.stderr) break # End of page: flush remaining persistence window if persist_state is not None and not self.shutdown_requested: for i, (pending_row, window_item) in 
enumerate(zip(pending_persistence, persist_window)): rev_id, tokens_added, tokens_removed = window_item num_token_revs, num_tokens = calculate_persistence(tokens_added) pending_row["token_revs"] = num_token_revs pending_row["tokens_added"] = num_tokens pending_row["tokens_removed"] = len(tokens_removed) pending_row["tokens_window"] = len(persist_window) - (i + 1) if not self.text and "text" in pending_row: del pending_row["text"] for k, v in pending_row.items(): write_buffer[k].append(v) buffer_count += 1 last_namespace = page.mwpage.namespace if self.shutdown_requested: break page_count += 1 # Flush remaining buffer flush_buffer() # Cancel time limit timer self._cancel_time_limit_timer(time_limit_timer) print( "Done: %s revisions and %s pages." % (rev_count, page_count), file=sys.stderr, ) # Close all writers if self.output_parquet and self.partition_namespaces: for pq_writer in pq_writers.values(): pq_writer.close() elif writer is not None: writer.close() # Close checkpoint file; delete it only if we completed without interruption self._close_checkpoint(delete=not self.shutdown_requested) # Merge temp output with original for parquet resume if original_output_file is not None and temp_output_file is not None: finalize_resume_merge( original_output_file, temp_output_file, self.partition_namespaces, original_partition_dir ) def match_archive_suffix(input_filename): if re.match(r".*\.7z$", input_filename): cmd = ["7za", "x", "-so", input_filename] elif re.match(r".*\.gz$", input_filename): cmd = ["zcat", input_filename] elif re.match(r".*\.bz2$", input_filename): cmd = ["bzcat", "-dk", input_filename] else: raise ValueError("Unrecognized file type: %s" % input_filename) return cmd def open_input_file(input_filename, fandom_2020=False): cmd = match_archive_suffix(input_filename) if fandom_2020: cmd.append("*.xml") try: return Popen(cmd, stdout=PIPE).stdout except NameError: return open(input_filename, "r") def get_output_filename(input_filename, output_format='tsv') 
-> str:
    """Generate output filename based on input filename and format.

    Strips any .7z/.gz/.bz2 archive suffix and a .xml suffix from the input
    path, then appends the extension for the requested output format.

    Args:
        input_filename: Input dump file path
        output_format: 'tsv', 'jsonl', or 'parquet'
    """
    output_filename = re.sub(r"\.(7z|gz|bz2)?$", "", input_filename)
    # NOTE(review): this substitution is unanchored, so it would also remove
    # a ".xml" occurring mid-path — confirm inputs never contain one.
    output_filename = re.sub(r"\.xml", "", output_filename)
    if output_format == 'jsonl':
        output_filename = output_filename + ".jsonl"
    elif output_format == 'parquet':
        output_filename = output_filename + ".parquet"
    else:
        output_filename = output_filename + ".tsv"
    return output_filename


def main():
    """Command-line entry point: parse arguments and run WikiqParser.

    Processes each dump file named on the command line (or stdin when none
    are given), writing TSV/JSONL/Parquet output, with optional resume,
    namespace partitioning, persistence measures, and graceful shutdown on
    signals.
    """
    parser = argparse.ArgumentParser(
        description="Parse MediaWiki XML database dumps into tab delimited data."
    )

    # arguments for the input direction
    parser.add_argument(
        "dumpfiles",
        metavar="DUMPFILE",
        nargs="*",
        type=str,
        help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.",
    )

    parser.add_argument(
        "-o",
        "--output",
        metavar="OUTPUT",
        dest="output",
        type=str,
        nargs=1,
        help="Output file or directory. Format is detected from extension: .jsonl for JSONL, .parquet for Parquet, otherwise TSV.",
    )

    parser.add_argument(
        "-s",
        "--stdout",
        dest="stdout",
        action="store_true",
        help="Write output to standard out (do not create dump file)",
    )

    parser.add_argument(
        "--print-schema",
        dest="print_schema",
        action="store_true",
        help="Print the Spark-compatible JSON schema for the output and exit. No dump file is processed.",
    )

    parser.add_argument(
        "--collapse-user",
        dest="collapse_user",
        action="store_true",
        help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.",
    )

    # -p with no value yields const="" which maps to the sequence method below.
    parser.add_argument(
        "-p",
        "--persistence",
        dest="persist",
        default=None,
        const="",
        type=str,
        choices=["", "wikidiff2", "segment", "sequence", "legacy"],
        nargs="?",
        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is no persistence. -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. -p=segment attempts advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which (should be) faster and more robust.",
    )

    parser.add_argument(
        "-n",
        "--namespace-include",
        dest="namespace_filter",
        type=int,
        action="append",
        help="Id number of namespace to include. Can be specified more than once.",
    )

    parser.add_argument(
        "-rr",
        "--revert-radius",
        dest="revert_radius",
        type=int,
        action="store",
        default=15,
        help="Number of edits to check when looking for reverts (default: 15)",
    )

    parser.add_argument(
        "-RP",
        "--revision-pattern",
        dest="regex_match_revision",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in revision text. The regex must be surrounded by quotes.",
    )

    parser.add_argument(
        "-RPl",
        "--revision-pattern-label",
        dest="regex_revision_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in revision text.",
    )

    parser.add_argument(
        "-CP",
        "--comment-pattern",
        dest="regex_match_comment",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in comments of revisions.",
    )

    parser.add_argument(
        "-CPl",
        "--comment-pattern-label",
        dest="regex_comment_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in comments.",
    )

    parser.add_argument(
        "-d",
        "--diff",
        dest="diff",
        default=False,
        action="store_true",
        help="Output a diff structure for each revision with information about changed or moved lines.",
    )

    parser.add_argument(
        "-t",
        "--text",
        dest="text",
        default=False,
        action="store_true",
        help="Output the text of the revision.",
    )

    parser.add_argument(
        "--external-links",
        dest="external_links",
        action="store_true",
        default=False,
        help="Extract external links from each revision using mwparserfromhell.",
    )

    parser.add_argument(
        "--citations",
        dest="citations",
        action="store_true",
        default=False,
        help="Extract citations (ref tags and cite templates) from each revision.",
    )

    parser.add_argument(
        "--wikilinks",
        dest="wikilinks",
        action="store_true",
        default=False,
        help="Extract internal wikilinks from each revision.",
    )

    parser.add_argument(
        "--templates",
        dest="templates",
        action="store_true",
        default=False,
        help="Extract templates with their parameters from each revision.",
    )

    parser.add_argument(
        "--headings",
        dest="headings",
        action="store_true",
        default=False,
        help="Extract section headings from each revision.",
    )

    parser.add_argument(
        "--fandom-2020",
        dest="fandom_2020",
        action="store_true",
        help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.",
    )

    parser.add_argument(
        "--batch-size",
        dest="batch_size",
        default=1500,
        type=int,
        help="How many revisions to process in each batch. This ends up being the Parquet row group size",
    )

    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="Resume processing from the last successfully written revision in the output file.",
    )

    parser.add_argument(
        "--time-limit",
        dest="time_limit",
        type=float,
        default=0,
        help="Time limit in hours before graceful shutdown. Set to 0 to disable (default).",
    )

    parser.add_argument(
        "--partition-namespaces",
        dest="partition_namespaces",
        action="store_true",
        default=False,
        help="For Parquet output, partition output by namespace into separate files.",
    )

    parser.add_argument(
        "--max-revisions-per-file",
        dest="max_revisions_per_file",
        type=int,
        default=0,
        help="For Parquet output, split output into multiple files after this many revisions. Set to 0 to disable (default).",
    )

    args = parser.parse_args()

    # set persistence method
    if args.persist is None:
        persist = PersistMethod.none
    elif args.persist == "segment":
        persist = PersistMethod.segment
    elif args.persist == "legacy":
        persist = PersistMethod.legacy
    elif args.persist == "wikidiff2":
        persist = PersistMethod.wikidiff2
    else:
        # Covers both explicit -p=sequence and bare -p (const="").
        persist = PersistMethod.sequence

    if args.namespace_filter is not None:
        namespaces = args.namespace_filter
    else:
        namespaces = None

    # Handle --print-schema: build and output schema, then exit
    if args.print_schema:
        regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
        regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)

        table, _ = build_table(
            text=args.text,
            collapse_user=args.collapse_user,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
        )

        schema = build_schema(
            table,
            diff=args.diff,
            persist=persist,
            text=args.text,
            regex_revision_pairs=regex_revision_pairs,
            regex_comment_pairs=regex_comment_pairs,
        )

        spark_schema = pyarrow_to_spark_schema(schema)
        print(json.dumps(spark_schema, indent=2))
        sys.exit(0)

    print(args, file=sys.stderr)

    if len(args.dumpfiles) > 0:
        for filename in args.dumpfiles:
            # Determine output file path before opening input (so resume errors are caught early)
            if args.output:
                output = args.output[0]
            else:
                output = "."

            # Detect output format from extension
            output_jsonl_dir = output.endswith(".jsonl.d")
            output_jsonl = output.endswith(".jsonl") or output_jsonl_dir
            output_parquet = output.endswith(".parquet")
            partition_namespaces = args.partition_namespaces and output_parquet

            if args.stdout:
                output_file = sys.stdout.buffer
            elif output_jsonl or output_parquet:
                # Output is a JSONL or Parquet file path - use it directly
                output_file = output
            elif os.path.isdir(output):
                # Output is a directory - derive filename from input
                output_filename = os.path.join(output, os.path.basename(filename))
                output_file = get_output_filename(output_filename, output_format='tsv')
            else:
                output_file = output

            # Handle resume functionality before opening input file
            resume_point = None
            if args.resume:
                if (output_jsonl or output_parquet) and not args.stdout:
                    # Clean up any interrupted resume from previous run
                    if output_parquet:
                        cleanup_result = cleanup_interrupted_resume(output_file, partition_namespaces)
                        if cleanup_result == "start_fresh":
                            resume_point = None
                        else:
                            resume_point = get_resume_point(output_file, partition_namespaces)
                    else:
                        # JSONL: get resume point from last line of file (no checkpoint)
                        resume_point = get_resume_point(output_file, input_file=filename)
                    if resume_point is not None:
                        # A dict resume point means per-namespace checkpoints;
                        # otherwise it is a (pageid, revid) pair.
                        if isinstance(resume_point, dict):
                            print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
                        else:
                            pageid, revid = resume_point[0], resume_point[1]
                            print(f"Resuming from checkpoint: pageid={pageid}, revid={revid}", file=sys.stderr)
                else:
                    sys.exit("Error: --resume only works with JSONL or Parquet output (not stdout or TSV)")

            # Now open the input file
            print("Processing file: %s" % filename, file=sys.stderr)
            input_file = open_input_file(filename, args.fandom_2020)

            time_limit_seconds = args.time_limit * 3600 if args.time_limit > 0 else None

            wikiq = WikiqParser(
                input_file,
                output_file,
                collapse_user=args.collapse_user,
                persist=persist,
                namespaces=namespaces,
                revert_radius=args.revert_radius,
                regex_match_revision=args.regex_match_revision,
                regex_revision_label=args.regex_revision_label,
                regex_match_comment=args.regex_match_comment,
                regex_comment_label=args.regex_comment_label,
                text=args.text,
                diff=args.diff,
                output_jsonl=output_jsonl,
                output_jsonl_dir=output_jsonl_dir,
                output_parquet=output_parquet,
                partition_namespaces=partition_namespaces,
                batch_size=args.batch_size,
                resume_point=resume_point,
                external_links=args.external_links,
                citations=args.citations,
                wikilinks=args.wikilinks,
                templates=args.templates,
                headings=args.headings,
                time_limit_seconds=time_limit_seconds,
                max_revisions_per_file=args.max_revisions_per_file,
            )

            # Register signal handlers for graceful shutdown (CLI only)
            def handle_shutdown(signum, frame):
                sig_name = signal.Signals(signum).name
                print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
                wikiq.request_shutdown()

            original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
            original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
            original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)
            original_sigusr2 = signal.signal(signal.SIGUSR2, handle_shutdown)

            try:
                wikiq.process()
            finally:
                # Restore original signal handlers
                signal.signal(signal.SIGTERM, original_sigterm)
                signal.signal(signal.SIGINT, original_sigint)
                signal.signal(signal.SIGUSR1, original_sigusr1)
                signal.signal(signal.SIGUSR2, original_sigusr2)

            # close things
            input_file.close()

    else:
        # No dump files given: read from stdin, write to stdout.
        if args.resume:
            print("Warning: --resume cannot be used with stdin/stdout", file=sys.stderr)
        time_limit_seconds = args.time_limit * 3600 if args.time_limit > 0 else None
        wikiq = WikiqParser(
            sys.stdin,
            sys.stdout,
            collapse_user=args.collapse_user,
            persist=persist,
            # persist_legacy=args.persist_legacy,
            namespaces=namespaces,
            revert_radius=args.revert_radius,
            regex_match_revision=args.regex_match_revision,
            regex_revision_label=args.regex_revision_label,
            regex_match_comment=args.regex_match_comment,
            regex_comment_label=args.regex_comment_label,
            diff=args.diff,
            text=args.text,
            batch_size=args.batch_size,
            resume_point=None,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
            time_limit_seconds=time_limit_seconds,
        )

        # Register signal handlers for graceful shutdown (CLI only)
        # NOTE(review): unlike the file branch above, SIGUSR2 is not
        # registered here — confirm whether that asymmetry is intentional.
        def handle_shutdown(signum, frame):
            sig_name = signal.Signals(signum).name
            print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
            wikiq.request_shutdown()

        original_sigterm = signal.signal(signal.SIGTERM, handle_shutdown)
        original_sigint = signal.signal(signal.SIGINT, handle_shutdown)
        original_sigusr1 = signal.signal(signal.SIGUSR1, handle_shutdown)

        try:
            wikiq.process()
        finally:
            # Restore original signal handlers
            signal.signal(signal.SIGTERM, original_sigterm)
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGUSR1, original_sigusr1)


# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")

if __name__ == "__main__":
    main()