Files
mediawiki_dump_tools/src/wikiq/__init__.py
2025-12-23 09:09:51 -08:00

1609 lines
60 KiB
Python
Executable File

#!/usr/bin/env python3
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
import argparse
import gc
import json
import os.path
import re
import signal
import sys
import threading
import time
from collections import deque, defaultdict
from hashlib import sha1
from io import TextIOWrapper
from itertools import groupby
from subprocess import PIPE, Popen
from typing import IO, Any, Generator, TextIO, Union
import mwpersistence
import mwreverts
import mwxml
import pywikidiff2
from deltas.tokenizers import wikitext_split
from more_itertools import peekable
from mwxml import Dump
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
from wikiq.resume import (
get_checkpoint_path,
read_checkpoint,
get_resume_point,
setup_resume_temp_output,
finalize_resume_merge,
cleanup_interrupted_resume,
)
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
DIFF_TIMEOUT_MS = 60000
from pathlib import Path
import pyarrow as pa
import pyarrow.csv as pacsv
import pyarrow.parquet as pq
from deltas import SegmentMatcher, SequenceMatcher
def pyarrow_type_to_spark(pa_type):
    """Convert a PyArrow type to Spark JSON schema format.

    Scalar types map to Spark's JSON type names; list/struct/map types
    recurse into their element/field/key/value types. Unrecognized types
    fall back to "string".
    """
    if pa.types.is_int64(pa_type):
        return "long"
    elif pa.types.is_int32(pa_type):
        return "integer"
    elif pa.types.is_int16(pa_type):
        # Previously fell through to "string"; Spark calls 16-bit ints "short".
        return "short"
    elif pa.types.is_int8(pa_type):
        return "byte"
    elif pa.types.is_float64(pa_type):
        # Previously fell through to "string", corrupting numeric columns.
        return "double"
    elif pa.types.is_float32(pa_type):
        return "float"
    elif pa.types.is_boolean(pa_type):
        return "boolean"
    elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
        return "string"
    elif pa.types.is_binary(pa_type) or pa.types.is_large_binary(pa_type):
        return "binary"
    elif pa.types.is_timestamp(pa_type):
        return "timestamp"
    elif pa.types.is_date(pa_type):
        return "date"
    elif pa.types.is_list(pa_type):
        return {
            "type": "array",
            "elementType": pyarrow_type_to_spark(pa_type.value_type),
            "containsNull": True
        }
    elif pa.types.is_struct(pa_type):
        return {
            "type": "struct",
            "fields": [
                {
                    "name": field.name,
                    "type": pyarrow_type_to_spark(field.type),
                    "nullable": field.nullable,
                    "metadata": {}
                }
                for field in pa_type
            ]
        }
    elif pa.types.is_map(pa_type):
        return {
            "type": "map",
            "keyType": pyarrow_type_to_spark(pa_type.key_type),
            "valueType": pyarrow_type_to_spark(pa_type.item_type),
            "valueContainsNull": True
        }
    else:
        # Conservative fallback for anything not explicitly mapped.
        return "string"
def pyarrow_to_spark_schema(schema: "pa.Schema") -> dict:
    """Convert a PyArrow schema to Spark JSON schema format (a struct dict)."""
    field_specs = []
    for field in schema:
        field_specs.append({
            "name": field.name,
            "type": pyarrow_type_to_spark(field.type),
            "nullable": field.nullable,
            "metadata": {},
        })
    return {"type": "struct", "fields": field_specs}
def build_table(
    text: bool = False,
    collapse_user: bool = False,
    external_links: bool = False,
    citations: bool = False,
    wikilinks: bool = False,
    templates: bool = False,
    headings: bool = False,
):
    """Build the RevisionTable with appropriate columns based on flags.

    Returns:
        (table, reverts_column) - the table and a reference to the reverts
        column (which process() needs for setting the revert detector).
    """
    reverts_column = tables.RevisionReverts()
    base_columns = [
        tables.RevisionId(),
        tables.RevisionTimestamp(),
        tables.RevisionArticleId(),
        tables.RevisionPageTitle(),
        tables.RevisionNamespace(),
        tables.RevisionDeleted(),
        tables.RevisionEditorId(),
        tables.RevisionEditSummary(),
        tables.RevisionTextChars(),
        reverts_column,
        tables.RevisionSha1(),
        tables.RevisionIsMinor(),
        tables.RevisionEditorText(),
        tables.RevisionIsAnon(),
    ]
    table = RevisionTable(base_columns)
    if text:
        table.columns.append(tables.RevisionText())
    if collapse_user:
        table.columns.append(tables.RevisionCollapsed())
    # Wikitext-derived columns share a single parser instance.
    parser_flags = [
        (external_links, tables.RevisionExternalLinks),
        (citations, tables.RevisionCitations),
        (wikilinks, tables.RevisionWikilinks),
        (templates, tables.RevisionTemplates),
        (headings, tables.RevisionHeadings),
    ]
    if any(flag for flag, _ in parser_flags):
        wikitext_parser = WikitextParser()
        for flag, column_cls in parser_flags:
            if flag:
                table.columns.append(column_cls(wikitext_parser))
        # Timeout flag column accompanies any parser-derived column.
        table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
    return table, reverts_column
def build_schema(
    table,
    diff: bool = False,
    persist: int = 0,
    text: bool = False,
    regex_revision_pairs: list = None,
    regex_comment_pairs: list = None,
) -> pa.Schema:
    """Build the PyArrow schema from a table, adding output-only fields."""
    schema = table.schema()
    # "revert" is derived at output time from the reverts column.
    schema = schema.append(pa.field("revert", pa.bool_(), nullable=True))
    if diff:
        from wikiq.diff_pyarrow_schema import diff_field
        schema = schema.append(diff_field)
        schema = schema.append(pa.field("diff_timeout", pa.bool_()))
    # Regex-derived columns: revision-content pairs first, then comment pairs.
    all_pairs = (regex_revision_pairs or []) + (regex_comment_pairs or [])
    for pair in all_pairs:
        for field in pair.get_pyarrow_fields():
            schema = schema.append(field)
    if persist != PersistMethod.none:
        # RevisionText is added to the table for extraction, but not to schema
        # (unless text=True, in which case it's already there from build_table).
        for stat_name in ("token_revs", "tokens_added", "tokens_removed", "tokens_window"):
            schema = schema.append(pa.field(stat_name, pa.int64(), nullable=True))
    return schema
def make_regex_pairs(patterns, labels) -> list:
    """Pair regex patterns with labels as RegexPair objects.

    Exits the program when patterns and labels do not line up one-to-one.
    """
    if patterns is None and labels is None:
        return []
    if patterns is None or labels is None or len(patterns) != len(labels):
        sys.exit("Each regular expression *must* come with a corresponding label and vice versa.")
    return [RegexPair(pattern, label) for pattern, label in zip(patterns, labels)]
class JSONLWriter:
    """Write JSONL output with schema validation.

    Rows are emitted one JSON object per line, restricted to the schema's
    column names. When opened with append=True, the last line of an existing
    file is validated first and truncated if it is corrupted (e.g. a previous
    run was killed mid-write).
    """

    def __init__(self, output_file: str, schema: "pa.Schema", append: bool = False):
        self.output_file = output_file
        self.schema = schema
        # Only these columns are written; extra keys in a batch are ignored.
        self.field_names = [field.name for field in schema]
        if append and os.path.exists(output_file):
            self._validate_and_fix_last_line(output_file)
        mode = "a" if append else "w"
        self._file = open(output_file, mode)

    def _validate_and_fix_last_line(self, filepath: str):
        """Validate the last line of JSONL file; truncate if corrupted.

        If the previous run was interrupted mid-write, the last line may be
        incomplete JSON. This detects and removes such corrupted lines.
        """
        with open(filepath, 'rb') as f:
            f.seek(0, 2)
            file_size = f.tell()
            if file_size == 0:
                return
            # Read backwards in growing chunks until we find the last newline
            # or have read the whole file. (The previous fixed 8 KiB read
            # misclassified any VALID last line longer than 8 KiB as corrupt
            # and truncated the entire file to zero bytes.)
            chunk_size = min(8192, file_size)
            while True:
                f.seek(-chunk_size, 2)
                chunk = f.read(chunk_size)
                last_newline = chunk.rfind(b'\n')
                if last_newline != -1 or chunk_size >= file_size:
                    break
                chunk_size = min(chunk_size * 2, file_size)
            if last_newline == -1:
                # Entire file is one line (possibly corrupted)
                last_line = chunk.decode('utf-8', errors='replace')
                truncate_pos = 0
            else:
                last_line = chunk[last_newline + 1:].decode('utf-8', errors='replace')
                truncate_pos = file_size - chunk_size + last_newline + 1
        # If last line is empty, file ends with newline - that's fine
        if not last_line.strip():
            return
        # Try to parse the last line as JSON
        try:
            json.loads(last_line)
        except json.JSONDecodeError:
            print(f"Warning: Last line of {filepath} is corrupted JSON, removing it",
                  file=sys.stderr)
            # Truncate the file to remove the corrupted last line
            with open(filepath, 'r+b') as f:
                f.truncate(truncate_pos)

    def write_batch(self, data: dict):
        """Write a batch of rows as JSONL.

        Args:
            data: dict mapping column names to lists of values
        """
        # Use the first schema column's list to detect an empty batch.
        if not data or not data.get(self.field_names[0]):
            return
        num_rows = len(data[self.field_names[0]])
        for i in range(num_rows):
            row = {}
            for name in self.field_names:
                if name in data:
                    row[name] = self._convert_value(data[name][i])
            self._file.write(json.dumps(row) + "\n")

    def _convert_value(self, value):
        """Convert a value to JSON-serializable format (recursing into containers)."""
        if value is None:
            return None
        elif isinstance(value, (str, int, float, bool)):
            return value
        elif hasattr(value, "isoformat"):
            # datetime/date-like values serialize as ISO 8601 strings.
            return value.isoformat()
        elif isinstance(value, (list, tuple)):
            return [self._convert_value(v) for v in value]
        elif isinstance(value, dict):
            return {k: self._convert_value(v) for k, v in value.items()}
        else:
            return str(value)

    def close(self):
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False
class PersistMethod:
    """Enumeration of token-persistence algorithms, as plain int constants."""
    none = 0       # persistence tracking disabled
    sequence = 1   # deltas SequenceMatcher-based diffs
    segment = 2    # deltas SegmentMatcher-based diffs
    legacy = 3     # legacy mw.lib.persistence State
    wikidiff2 = 4  # pywikidiff2-backed WikiDiffMatcher
def diff_with_timeout(differ, last_text, text):
    """Returns (result, timed_out) tuple using native pywikidiff2 timeout."""
    diff_json = differ.inline_json_diff(last_text, text, timeout_ms=DIFF_TIMEOUT_MS)
    timed_out = differ.timed_out()
    return diff_json, timed_out
def calculate_persistence(tokens_added):
    """Return (total persisting revisions, token count) for a list of tokens.

    Each token contributes len(token.revisions) - 1 persisting revisions.
    """
    persisting_revs = sum(len(token.revisions) - 1 for token in tokens_added)
    return persisting_revs, len(tokens_added)
def fix_hex_digests(revs: "list[mwxml.Revision]") -> "list[mwxml.Revision]":
    """Normalize revisions in place and return the same list.

    Replaces None text with "" and fills in a missing sha1 digest for any
    revision whose text was not deleted (some dumps omit sha1).

    The previous version kept an index counter and re-assigned each (already
    mutated) object back into the list — a no-op; annotations are quoted so
    they are not evaluated at import time.
    """
    for rev in revs:
        if rev.text is None:
            rev.text = ""
        if not rev.sha1 and not rev.deleted.text:
            rev.sha1 = sha1(rev.text.encode("utf8")).hexdigest()
    return revs
class WikiqIterator:
    """Wrap an mwxml Dump, yielding each page as a WikiqPage."""

    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        # id -> name lookup, used to qualify titles outside the main namespace.
        self.namespace_map = {
            ns.id: ns.name for ns in self.mwiterator.site_info.namespaces
        }
        self.__pages: Generator[WikiqPage] = self.load_pages()

    def load_pages(self):
        """Yield every page of the dump wrapped as a WikiqPage."""
        for mwpage in self.mwiterator:
            yield WikiqPage(
                mwpage,
                namespace_map=self.namespace_map,
                collapse_user=self.collapse_user,
            )

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
class WikiqPage:
    """A dump page whose revisions are yielded as lists.

    With collapse_user=True, consecutive revisions by the same user are
    grouped into one list; otherwise each revision is yielded alone.
    """

    __slots__ = (
        "id",
        "redirect",
        "restrictions",
        "mwpage",
        "__revisions",
        "collapse_user",
    )

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            page.namespace = 0
        if page.namespace != 0:
            # Qualify the title with its namespace name.
            page.title = f"{namespace_map[page.namespace]}:{page.title}"
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list()

    @staticmethod
    def user_text(rev) -> Union[str, None]:
        """Return the revision's user text, or None if the user is deleted."""
        if rev.deleted.user:
            return None
        return rev.user.text

    def rev_list(self):
        """Yield revisions grouped per the collapse_user setting.

        Without collapsing, each revision is its own single-element list.
        With collapsing, itertools.groupby batches consecutive revisions
        that share the same user text (a deleted user groups alone, since
        its key is None and groupby only merges adjacent equal keys).
        """
        if not self.collapse_user:
            for rev in self.mwpage:
                yield [rev]
        else:
            for _, user_revs in groupby(self.mwpage, self.user_text):
                yield list(user_revs)

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
"""
A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
    """A compiled regex plus a label, producing output columns from matches.

    If the pattern has named capture groups, one column is produced per group
    (named ``<label>_<group>``); otherwise a single column named ``<label>``.
    Matched values are joined with ", "; columns with no match are None.
    """

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        self.has_groups = bool(self.pattern.groupindex)
        if self.has_groups:
            self.capture_groups = list(self.pattern.groupindex.keys())

    def get_pyarrow_fields(self):
        """Return the pa.field objects this pair contributes to the schema."""
        if self.has_groups:
            return [
                pa.field(self._make_key(cap_group), pa.string())
                for cap_group in self.capture_groups
            ]
        return [pa.field(self.label, pa.string())]

    def _make_key(self, cap_group):
        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content: str) -> dict:
        """Return {column_name: joined matches or None} for this content.

        Every output column is always present in the result, even when
        *content* is None or not a string. (Previously a grouped pattern
        raised TypeError on None content — e.g. a deleted comment — and a
        group-less pattern returned an empty dict, silently dropping the
        column and desynchronizing downstream column buffers.)
        """
        searchable = isinstance(content, (str, bytes))
        result = {}
        if self.has_groups:
            keys = [self._make_key(g) for g in self.capture_groups]
            # Default every group column to None.
            result = {key: None for key in keys}
            if searchable and self.pattern.search(content) is not None:
                matchobjects = list(self.pattern.finditer(content))
                for cap_group, key in zip(self.capture_groups, keys):
                    # Only keep matches where this particular group matched.
                    values = [
                        m.group(cap_group)
                        for m in matchobjects
                        if m.group(cap_group) is not None
                    ]
                    if values:
                        result[key] = ", ".join(values)
        else:
            result[self.label] = None
            if searchable and self.pattern.search(content) is not None:
                result[self.label] = ", ".join(self.pattern.findall(content))
        return result
class WikiqParser:
    """Streams a MediaWiki XML dump and writes one output row per revision
    (or per collapsed revision group), with optional diff, persistence, and
    regex-derived columns. Output format is Parquet, JSONL, or TSV.
    """
    def __init__(
        self,
        input_file: Union[TextIOWrapper, IO[Any], IO[bytes]],
        output_file: Union[TextIO, str],
        regex_match_revision: list[str],
        regex_match_comment: list[str],
        regex_revision_label: list[str],
        regex_comment_label: list[str],
        text: bool = False,
        diff: bool = False,
        collapse_user: bool = False,
        persist: Union[int, None] = None,
        namespaces: Union[list[int], None] = None,
        revert_radius: int = 15,
        output_jsonl: bool = False,
        output_jsonl_dir: bool = False,
        output_parquet: bool = False,
        batch_size: int = 1024,
        resume_point: Union[tuple, dict, None] = None,
        partition_namespaces: bool = False,
        external_links: bool = False,
        citations: bool = False,
        wikilinks: bool = False,
        templates: bool = False,
        headings: bool = False,
        time_limit_seconds: Union[float, None] = None,
        max_revisions_per_file: int = 0,
    ):
        """
        Parameters:
            persist : what persistence method to use. Takes a PersistMethod value
            resume_point : if set, either a (pageid, revid) tuple for single-file output,
                or a dict mapping namespace -> (pageid, revid) for partitioned output.
                For single-file: skip all revisions up to and including this point.
            max_revisions_per_file : if > 0, close and rotate output files after this many revisions
        """
        self.input_file = input_file
        self.collapse_user: bool = collapse_user
        self.persist: Union[int, None] = persist
        self.namespaces = []
        self.revert_radius = revert_radius
        self.diff = diff
        self.text = text
        self.partition_namespaces = partition_namespaces
        self.resume_point = resume_point
        self.external_links = external_links
        self.citations = citations
        self.wikilinks = wikilinks
        self.templates = templates
        self.headings = headings
        # Set asynchronously (signal handler / timer) and polled by process().
        self.shutdown_requested = False
        self.time_limit_seconds = time_limit_seconds
        self.max_revisions_per_file = max_revisions_per_file
        # A set enables O(1) membership tests in the page loop.
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None
        self.regex_schemas = []
        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(
            regex_match_revision, regex_revision_label
        )
        self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(
            regex_match_comment, regex_comment_label
        )
        # Initialize output
        self.batch_size = batch_size
        self.output_jsonl = output_jsonl
        self.output_jsonl_dir = output_jsonl_dir
        self.output_parquet = output_parquet
        self.output_file = output_file
        if output_parquet:
            self.pq_writer = None
            self.parquet_buffer = []
        elif output_jsonl:
            pass  # JSONLWriter created in process()
        else:
            # TSV output
            self.print_header = True
            if output_file == sys.stdout.buffer:
                pass
            else:
                self.output_file = open(output_file, "wb")
        # Checkpoint for tracking resume point (path only, no open file handle for NFS safety)
        self.checkpoint_path = None
        self.checkpoint_state = {}  # namespace -> (pageid, revid) or None -> (pageid, revid)
    def request_shutdown(self):
        """Request graceful shutdown. The process() method will exit after completing the current batch."""
        self.shutdown_requested = True

    def _time_limit_expired(self):
        """Timer callback when time limit is reached."""
        # Seconds -> hours, purely for the log message.
        hours = self.time_limit_seconds / 3600
        print(f"Time limit of {hours:.2f} hours reached, requesting shutdown...", file=sys.stderr)
        self.request_shutdown()

    def _start_time_limit_timer(self):
        """Start a background timer to trigger shutdown when time limit is reached.

        Returns the threading.Timer, or None when no limit is configured.
        """
        if self.time_limit_seconds is None:
            return None
        timer = threading.Timer(self.time_limit_seconds, self._time_limit_expired)
        # Daemon thread: never block interpreter exit if still pending.
        timer.daemon = True
        timer.start()
        return timer

    def _cancel_time_limit_timer(self, timer):
        """Cancel the time limit timer if it's still running."""
        if timer is not None:
            timer.cancel()
def _get_part_path(self, base_path, part_num):
"""Generate path with part number inserted before extension.
Example: output.parquet -> output.part0.parquet
"""
path = Path(base_path)
return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
    def _open_checkpoint(self, output_file):
        """Enable checkpointing for Parquet output only.

        JSONL doesn't need checkpoint files - resume point is derived from last line.
        """
        # stdout has no stable path to checkpoint against.
        if not self.output_parquet or output_file == sys.stdout.buffer:
            return
        self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
        Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
        print(f"Checkpoint enabled: {self.checkpoint_path}", file=sys.stderr)

    def _update_checkpoint(self, pageid, revid, namespace=None, part=0):
        """Update checkpoint state and write atomically (NFS-safe)."""
        if self.checkpoint_path is None:
            return
        if self.partition_namespaces:
            # Partitioned output tracks a separate resume point per namespace.
            self.checkpoint_state[namespace] = {"pageid": pageid, "revid": revid, "part": part}
        else:
            self.checkpoint_state = {"pageid": pageid, "revid": revid, "part": part}
        # Atomic write: write to temp file, then rename
        temp_path = self.checkpoint_path + ".tmp"
        with open(temp_path, 'w') as f:
            json.dump(self.checkpoint_state, f)
        # os.replace is atomic, so a reader never observes a partial file.
        os.replace(temp_path, self.checkpoint_path)

    def _close_checkpoint(self, delete=False):
        """Clean up checkpoint, optionally deleting it."""
        if self.checkpoint_path is None:
            return
        if delete and os.path.exists(self.checkpoint_path):
            # Clean completion: the checkpoint is no longer needed.
            os.remove(self.checkpoint_path)
            print(f"Checkpoint deleted: {self.checkpoint_path}", file=sys.stderr)
        elif os.path.exists(self.checkpoint_path):
            print(f"Checkpoint preserved for resume: {self.checkpoint_path}", file=sys.stderr)
        # Clean up any leftover temp file
        temp_path = self.checkpoint_path + ".tmp"
        if os.path.exists(temp_path):
            os.remove(temp_path)
    def _write_batch(self, row_buffer, schema, writer, pq_writers, ns_base_paths, sorting_cols, namespace=None, part_numbers=None):
        """Write a batch of rows to the appropriate writer.

        For partitioned output, creates writer lazily if needed.
        Returns (writer, num_rows) - writer used and number of rows written.
        """
        num_rows = len(row_buffer.get("revid", []))
        if self.partition_namespaces and namespace is not None:
            if namespace not in pq_writers:
                # Lazily open one ParquetWriter per namespace on first use.
                base_path = ns_base_paths[namespace]
                part_num = part_numbers.get(namespace, 0) if part_numbers else 0
                if self.max_revisions_per_file > 0:
                    # File rotation enabled: include the part number in the name.
                    ns_path = self._get_part_path(base_path, part_num)
                else:
                    ns_path = base_path
                Path(ns_path).parent.mkdir(exist_ok=True, parents=True)
                pq_writers[namespace] = pq.ParquetWriter(
                    ns_path, schema, flavor="spark", sorting_columns=sorting_cols
                )
            writer = pq_writers[namespace]
        writer.write(pa.record_batch(row_buffer, schema=schema))
        return writer, num_rows
def make_matchmake_pairs(self, patterns, labels) -> list[RegexPair]:
if (patterns is not None and labels is not None) and (
len(patterns) == len(labels)
):
result: list[RegexPair] = []
for pattern, label in zip(patterns, labels):
rp = RegexPair(pattern, label)
result.append(rp)
self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
return result
elif (patterns is None) and (labels is None):
return []
else:
sys.exit(
"Each regular expression *must* come with a corresponding label and vice versa."
)
def matchmake_revision(self, rev: mwxml.Revision):
result = self.matchmake_text(rev.text)
for k, v in self.matchmake_comment(rev.comment).items():
result[k] = v
return result
def matchmake_text(self, text: str):
return self.matchmake_pairs(text, self.regex_revision_pairs)
def matchmake_comment(self, comment: str):
return self.matchmake_pairs(comment, self.regex_comment_pairs)
@staticmethod
def matchmake_pairs(text, pairs):
result = {}
for pair in pairs:
for k, v in pair.matchmake(text).items():
result[k] = v
return result
    def __get_namespace_from_title(self, title):
        """Resolve a title's namespace id by prefix-matching against
        self.namespaces (a name -> id mapping), falling back to a default.

        NOTE(review): the `ns is None` branch reads self.namespaces[None],
        so a non-None default only exists when the dump declared a namespace
        whose name is None — confirm against mwxml site_info semantics.
        """
        default_ns = None
        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns is None:
                default_ns = self.namespaces[ns]
                continue
            if title.startswith(ns + ":"):
                return self.namespaces[ns]
        # if we've made it this far with no matches, we return the default namespace
        return default_ns
    def process(self):
        """Run the full pipeline: iterate pages/revisions of the dump, compute
        optional revert/diff/persistence/regex columns, and stream batches to
        the configured writer (Parquet, JSONL, or TSV), checkpointing along
        the way and honoring resume points and graceful-shutdown requests.
        """
        # Start time limit timer if configured
        time_limit_timer = self._start_time_limit_timer()
        # Track whether we've passed the resume point
        if self.resume_point is None:
            found_resume_point = True
        elif self.partition_namespaces:
            found_resume_point = {}
        else:
            found_resume_point = False
        # When resuming with parquet, write new data to temp file/directory and merge at the end
        original_output_file = None
        temp_output_file = None
        original_partition_dir = None
        if self.resume_point is not None and self.output_parquet:
            original_output_file, temp_output_file, original_partition_dir = \
                setup_resume_temp_output(self.output_file, self.partition_namespaces)
            if temp_output_file is not None:
                self.output_file = temp_output_file
        # Open checkpoint file for tracking resume point
        checkpoint_output = original_output_file if original_output_file else self.output_file
        self._open_checkpoint(checkpoint_output)
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
        table, reverts_column = build_table(
            text=self.text,
            collapse_user=self.collapse_user,
            external_links=self.external_links,
            citations=self.citations,
            wikilinks=self.wikilinks,
            templates=self.templates,
            headings=self.headings,
        )
        # Extract list of namespaces
        self.namespaces = {
            ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
        }
        page_count = 0
        rev_count = 0
        schema = build_schema(
            table,
            diff=self.diff,
            persist=self.persist,
            text=self.text,
            regex_revision_pairs=self.regex_revision_pairs,
            regex_comment_pairs=self.regex_comment_pairs,
        )
        # Add RevisionText to table for diff/persist computation (extraction only, not output)
        if (self.diff or self.persist != PersistMethod.none) and not self.text:
            table.columns.append(tables.RevisionText())
        # Initialize writer
        writer = None
        sorting_cols = None
        ns_base_paths = {}
        pq_writers = {}
        part_numbers = {}
        if self.output_parquet:
            pageid_sortingcol = pq.SortingColumn(schema.get_field_index("articleid"))
            revid_sortingcol = pq.SortingColumn(schema.get_field_index("revid"))
            sorting_cols = [pageid_sortingcol, revid_sortingcol]
            if self.resume_point is not None:
                if self.partition_namespaces:
                    for ns, resume_data in self.resume_point.items():
                        part_numbers[ns] = resume_data[2] if len(resume_data) > 2 else 0
                else:
                    part_numbers[None] = self.resume_point[2] if len(self.resume_point) > 2 else 0
            if not self.partition_namespaces:
                if self.max_revisions_per_file > 0:
                    output_path_with_part = self._get_part_path(self.output_file, part_numbers.get(None, 0))
                else:
                    output_path_with_part = self.output_file
                writer = pq.ParquetWriter(
                    output_path_with_part,
                    schema,
                    flavor="spark",
                    sorting_columns=sorting_cols,
                )
            else:
                # Partitioned output: one file per namespace, writers opened lazily.
                output_path = Path(self.output_file)
                if self.namespace_filter is not None:
                    namespaces = self.namespace_filter
                else:
                    namespaces = self.namespaces.values()
                ns_base_paths = {
                    ns: (output_path.parent / f"namespace={ns}") / output_path.name
                    for ns in namespaces
                }
                for ns in namespaces:
                    if ns not in part_numbers:
                        part_numbers[ns] = 0
        elif self.output_jsonl:
            append_mode = self.resume_point is not None
            if self.output_jsonl_dir:
                # Create directory for JSONL output
                Path(self.output_file).mkdir(parents=True, exist_ok=True)
                part_num = 0
                if self.resume_point is not None and len(self.resume_point) > 2:
                    part_num = self.resume_point[2]
                part_numbers[None] = part_num
                jsonl_path = self._get_part_path(
                    Path(self.output_file) / "data.jsonl", part_num
                )
                writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
            else:
                writer = JSONLWriter(self.output_file, schema, append=append_mode)
        else:
            writer = pacsv.CSVWriter(
                self.output_file,
                schema,
                write_options=pacsv.WriteOptions(delimiter="\t"),
            )
        # Initialize diff machinery
        differ = None
        fast_differ = None
        if self.diff:
            # Primary differ: effectively unlimited, but may time out on huge diffs.
            differ = pywikidiff2.pywikidiff2(
                num_context_lines=1000000,
                max_word_level_diff_complexity=-1,
                moved_paragraph_detection_cutoff=-1,
                words_cache_capacity=10000,
                diff_cache_capacity=10000,
                stats_cache_capacity=10000,
            )
            # Fallback differ with default complexity limits for timeout cases.
            fast_differ = pywikidiff2.pywikidiff2(
                num_context_lines=1000000,
                max_word_level_diff_complexity=40000000,
                moved_paragraph_detection_cutoff=100,
                words_cache_capacity=-1,
                diff_cache_capacity=-1,
                stats_cache_capacity=-1,
            )
        # Write buffer: accumulate rows before flushing
        write_buffer = defaultdict(list)
        buffer_count = 0
        last_namespace = None

        def flush_buffer():
            # Flush buffered rows to the active writer and record a checkpoint.
            nonlocal write_buffer, buffer_count, last_namespace
            if buffer_count == 0:
                return
            row_buffer = dict(write_buffer)
            namespace = last_namespace
            if self.output_parquet:
                if self.partition_namespaces:
                    self._write_batch(
                        row_buffer, schema, writer, pq_writers, ns_base_paths,
                        sorting_cols, namespace=namespace, part_numbers=part_numbers
                    )
                else:
                    writer.write(pa.record_batch(row_buffer, schema=schema))
            elif self.output_jsonl:
                writer.write_batch(row_buffer)
            else:
                writer.write(pa.record_batch(row_buffer, schema=schema))
            # Update checkpoint
            last_pageid = row_buffer["articleid"][-1]
            last_revid = row_buffer["revid"][-1]
            part = part_numbers.get(namespace if self.partition_namespaces else None, 0)
            self._update_checkpoint(last_pageid, last_revid,
                                    namespace=namespace if self.partition_namespaces else None,
                                    part=part)
            write_buffer = defaultdict(list)
            buffer_count = 0

        # Iterate through pages
        for page in dump:
            # Skip namespaces not in the filter
            if self.namespace_filter is not None:
                if page.mwpage.namespace not in self.namespace_filter:
                    continue
            # Resume logic: skip pages before the resume point
            is_resume_page = False
            page_resume_revid = None
            if self.resume_point is not None and not found_resume_point:
                page_id = page.mwpage.id
                resume_pageid, resume_revid = self.resume_point[0], self.resume_point[1]
                if page_id < resume_pageid:
                    continue
                elif page_id == resume_pageid:
                    is_resume_page = True
                    page_resume_revid = resume_revid
                else:
                    found_resume_point = True
            # Reset revert detector for new page
            if self.revert_radius > 0:
                reverts_column.rev_detector = mwreverts.Detector(radius=self.revert_radius)
            else:
                reverts_column.rev_detector = None
            # State for this page
            prev_text = ""
            persist_state = None
            persist_window = None
            if self.persist != PersistMethod.none:
                persist_window = deque(maxlen=PERSISTENCE_RADIUS)
                if self.persist == PersistMethod.sequence:
                    persist_state = mwpersistence.DiffState(
                        SequenceMatcher(tokenizer=wikitext_split),
                        revert_radius=PERSISTENCE_RADIUS,
                    )
                elif self.persist == PersistMethod.segment:
                    persist_state = mwpersistence.DiffState(
                        SegmentMatcher(tokenizer=wikitext_split),
                        revert_radius=PERSISTENCE_RADIUS,
                    )
                elif self.persist == PersistMethod.wikidiff2:
                    wikidiff_matcher = WikiDiffMatcher(tokenizer=wikitext_split)
                    persist_state = mwpersistence.DiffState(
                        wikidiff_matcher, revert_radius=PERSISTENCE_RADIUS
                    )
                else:
                    from mw.lib import persistence
                    persist_state = persistence.State()
            # Pending persistence values waiting for window to fill
            pending_persistence = []
            # Use peekable to detect last revision in page
            revs_iter = peekable(page)
            for revs in revs_iter:
                # revs is either a single revision or a group (collapse_user mode)
                revs = list(revs)
                revs = fix_hex_digests(revs)
                rev = revs[-1]  # Last revision in the group
                # NOTE(review): is_last_in_page appears unused below — confirm.
                is_last_in_page = revs_iter.peek(None) is None
                # Skip revisions before resume point
                if is_resume_page:
                    if rev.id <= page_resume_revid:
                        # Update state for correctness when we resume output
                        if self.diff or self.persist != PersistMethod.none:
                            prev_text = rev.text or ""
                        if persist_state is not None:
                            text = rev.text or ""
                            if self.persist != PersistMethod.legacy:
                                persist_state.update(text, rev.id)
                            else:
                                persist_state.process(text, rev.id)
                        # Update revert detector so it has history for post-resume revisions
                        if reverts_column.rev_detector is not None and not rev.deleted.text:
                            reverts_column.rev_detector.process(rev.sha1, rev.id)
                        if rev.id == page_resume_revid:
                            found_resume_point = True
                            is_resume_page = False
                            print(f"Resuming output after revid {rev.id}", file=sys.stderr)
                        continue
                rev_count += 1
                # Extract base row data
                row = table.extract_row(page.mwpage, revs)
                # Compute revert flag
                if self.revert_radius == 0 or row["deleted"]:
                    row["revert"] = None
                else:
                    row["revert"] = row["reverteds"] is not None
                # Regex matching
                regex_dict = self.matchmake_revision(rev)
                for k, v in regex_dict.items():
                    row[k] = v
                # Compute diff
                text = row.get("text", "") or ""
                if self.diff:
                    diff_result, timed_out = diff_with_timeout(differ, prev_text, text)
                    if timed_out:
                        print(f"WARNING! wikidiff2 timeout for rev: {rev.id}. Falling back to default limits.", file=sys.stderr)
                        diff_result = fast_differ.inline_json_diff(prev_text, text)
                    # Keep only changed entries (type != 0 means not "context").
                    row["diff"] = [entry for entry in json.loads(diff_result)["diff"] if entry["type"] != 0]
                    row["diff_timeout"] = timed_out
                # Compute persistence
                if persist_state is not None:
                    if self.persist != PersistMethod.legacy:
                        _, tokens_added, tokens_removed = persist_state.update(text, rev.id)
                    else:
                        _, tokens_added, tokens_removed = persist_state.process(text, rev.id)
                    persist_window.append((rev.id, tokens_added, tokens_removed))
                    pending_persistence.append(row)
                    # When window is full, emit persistence for oldest revision
                    if len(persist_window) == PERSISTENCE_RADIUS:
                        old_rev_id, old_tokens_added, old_tokens_removed = persist_window.popleft()
                        oldest_row = pending_persistence.pop(0)
                        num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
                        oldest_row["token_revs"] = num_token_revs
                        oldest_row["tokens_added"] = num_tokens
                        oldest_row["tokens_removed"] = len(old_tokens_removed)
                        oldest_row["tokens_window"] = PERSISTENCE_RADIUS - 1
                        # Remove text if not outputting it
                        if not self.text and "text" in oldest_row:
                            del oldest_row["text"]
                        # Add to write buffer
                        for k, v in oldest_row.items():
                            write_buffer[k].append(v)
                        buffer_count += 1
                        last_namespace = page.mwpage.namespace
                        if buffer_count >= self.batch_size:
                            flush_buffer()
                # Update prev_text for next iteration
                if self.diff or self.persist != PersistMethod.none:
                    prev_text = text
                # If no persistence, write row directly
                if persist_state is None:
                    if not self.text and "text" in row:
                        del row["text"]
                    for k, v in row.items():
                        write_buffer[k].append(v)
                    buffer_count += 1
                    last_namespace = page.mwpage.namespace
                    if buffer_count >= self.batch_size:
                        flush_buffer()
                # Check for shutdown
                if self.shutdown_requested:
                    print("Shutdown requested, closing writers...", file=sys.stderr)
                    break
            # End of page: flush remaining persistence window
            if persist_state is not None and not self.shutdown_requested:
                for i, (pending_row, window_item) in enumerate(zip(pending_persistence, persist_window)):
                    rev_id, tokens_added, tokens_removed = window_item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
                    pending_row["token_revs"] = num_token_revs
                    pending_row["tokens_added"] = num_tokens
                    pending_row["tokens_removed"] = len(tokens_removed)
                    # Fewer later revisions exist for rows near the end of the page.
                    pending_row["tokens_window"] = len(persist_window) - (i + 1)
                    if not self.text and "text" in pending_row:
                        del pending_row["text"]
                    for k, v in pending_row.items():
                        write_buffer[k].append(v)
                    buffer_count += 1
                    last_namespace = page.mwpage.namespace
            if self.shutdown_requested:
                break
            page_count += 1
        # Flush remaining buffer
        flush_buffer()
        # Cancel time limit timer
        self._cancel_time_limit_timer(time_limit_timer)
        print(
            "Done: %s revisions and %s pages." % (rev_count, page_count),
            file=sys.stderr,
        )
        # Close all writers
        if self.output_parquet and self.partition_namespaces:
            for pq_writer in pq_writers.values():
                pq_writer.close()
        elif writer is not None:
            writer.close()
        # Close checkpoint file; delete it only if we completed without interruption
        self._close_checkpoint(delete=not self.shutdown_requested)
        # Merge temp output with original for parquet resume
        if original_output_file is not None and temp_output_file is not None:
            finalize_resume_merge(
                original_output_file,
                temp_output_file,
                self.partition_namespaces,
                original_partition_dir
            )
def match_archive_suffix(input_filename):
    """Return the decompression command (argv list) for a compressed dump.

    The returned command writes the decompressed stream to stdout.
    Recognized extensions: .7z (7za), .gz (zcat), .bz2 (bzcat).

    Args:
        input_filename: Path to the (possibly compressed) dump file.

    Raises:
        ValueError: if the filename has none of the recognized extensions.
    """
    # Plain suffix tests; no need for regular expressions here.
    if input_filename.endswith(".7z"):
        cmd = ["7za", "x", "-so", input_filename]
    elif input_filename.endswith(".gz"):
        cmd = ["zcat", input_filename]
    elif input_filename.endswith(".bz2"):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        raise ValueError("Unrecognized file type: %s" % input_filename)
    return cmd
def open_input_file(input_filename, fandom_2020=False):
    """Open a compressed dump for reading via an external decompressor.

    Returns the stdout pipe of the decompression subprocess.

    Args:
        input_filename: Path to the compressed dump file.
        fandom_2020: If True, append "*.xml" to the command so that the
            archive tool extracts every XML member (Wikiteam fandom 2020
            archives can bundle multiple .xml files).
            NOTE(review): this flag only makes sense for 7z archives;
            confirm callers never combine it with .gz/.bz2 inputs.

    Raises:
        ValueError: (from match_archive_suffix) for unrecognized extensions.
    """
    cmd = match_archive_suffix(input_filename)
    if fandom_2020:
        cmd.append("*.xml")
    # Popen is imported at module scope, so the historical
    # `except NameError: return open(input_filename, "r")` fallback could
    # never fire; it was dead code and has been removed.  A missing
    # decompressor binary raises FileNotFoundError, as before.
    return Popen(cmd, stdout=PIPE).stdout
def get_output_filename(input_filename, output_format='tsv') -> str:
    """Generate output filename based on input filename and format.

    Strips a trailing compression extension (.7z/.gz/.bz2) and a trailing
    .xml extension, then appends the extension for the chosen format.

    Args:
        input_filename: Input dump file path
        output_format: 'tsv', 'jsonl', or 'parquet'

    Returns:
        The derived output file path.
    """
    # Drop the compression suffix, then the .xml suffix.  Both patterns are
    # anchored at the end of the string: the previous unanchored r"\.xml"
    # also deleted ".xml" occurring in the middle of a filename
    # (e.g. "my.xml.backup.xml" -> "my.backup").
    output_filename = re.sub(r"\.(7z|gz|bz2)?$", "", input_filename)
    output_filename = re.sub(r"\.xml$", "", output_filename)
    if output_format == 'jsonl':
        return output_filename + ".jsonl"
    if output_format == 'parquet':
        return output_filename + ".parquet"
    return output_filename + ".tsv"
def main():
    """Command-line entry point for wikiq.

    Parses arguments, then runs a WikiqParser over each dump file given on
    the command line (or over stdin when none is given).  Responsibilities:
    output-format detection from the output path extension, --resume
    handling, graceful-shutdown signal handling, and the --print-schema
    short-circuit (which prints a Spark JSON schema and exits).
    """
    parser = argparse.ArgumentParser(
        description="Parse MediaWiki XML database dumps into tab delimited data."
    )

    # arguments for the input direction
    parser.add_argument(
        "dumpfiles",
        metavar="DUMPFILE",
        nargs="*",
        type=str,
        help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.",
    )
    parser.add_argument(
        "-o",
        "--output",
        metavar="OUTPUT",
        dest="output",
        type=str,
        nargs=1,
        help="Output file or directory. Format is detected from extension: .jsonl for JSONL, .parquet for Parquet, otherwise TSV.",
    )
    parser.add_argument(
        "-s",
        "--stdout",
        dest="stdout",
        action="store_true",
        help="Write output to standard out (do not create dump file)",
    )
    parser.add_argument(
        "--print-schema",
        dest="print_schema",
        action="store_true",
        help="Print the Spark-compatible JSON schema for the output and exit. No dump file is processed.",
    )
    parser.add_argument(
        "--collapse-user",
        dest="collapse_user",
        action="store_true",
        help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.",
    )
    parser.add_argument(
        "-p",
        "--persistence",
        dest="persist",
        default=None,
        const="",
        type=str,
        choices=["", "wikidiff2", "segment", "sequence", "legacy"],
        nargs="?",
        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may be slow. The default is no persistence. -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. -p=segment attempts advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which (should be) faster and more robust.",
    )
    parser.add_argument(
        "-n",
        "--namespace-include",
        dest="namespace_filter",
        type=int,
        action="append",
        help="Id number of namespace to include. Can be specified more than once.",
    )
    parser.add_argument(
        "-rr",
        "--revert-radius",
        dest="revert_radius",
        type=int,
        action="store",
        default=15,
        help="Number of edits to check when looking for reverts (default: 15)",
    )
    parser.add_argument(
        "-RP",
        "--revision-pattern",
        dest="regex_match_revision",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in revision text. The regex must be surrounded by quotes.",
    )
    parser.add_argument(
        "-RPl",
        "--revision-pattern-label",
        dest="regex_revision_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in revision text.",
    )
    parser.add_argument(
        "-CP",
        "--comment-pattern",
        dest="regex_match_comment",
        default=None,
        type=str,
        action="append",
        help="The regular expression to search for in comments of revisions.",
    )
    parser.add_argument(
        "-CPl",
        "--comment-pattern-label",
        dest="regex_comment_label",
        default=None,
        type=str,
        action="append",
        help="The label for the outputted column based on matching the regex in comments.",
    )
    parser.add_argument(
        "-d",
        "--diff",
        dest="diff",
        default=False,
        action="store_true",
        help="Output a diff structure for each revision with information about changed or moved lines.",
    )
    parser.add_argument(
        "-t",
        "--text",
        dest="text",
        default=False,
        action="store_true",
        help="Output the text of the revision.",
    )
    parser.add_argument(
        "--external-links",
        dest="external_links",
        action="store_true",
        default=False,
        help="Extract external links from each revision using mwparserfromhell.",
    )
    parser.add_argument(
        "--citations",
        dest="citations",
        action="store_true",
        default=False,
        help="Extract citations (ref tags and cite templates) from each revision.",
    )
    parser.add_argument(
        "--wikilinks",
        dest="wikilinks",
        action="store_true",
        default=False,
        help="Extract internal wikilinks from each revision.",
    )
    parser.add_argument(
        "--templates",
        dest="templates",
        action="store_true",
        default=False,
        help="Extract templates with their parameters from each revision.",
    )
    parser.add_argument(
        "--headings",
        dest="headings",
        action="store_true",
        default=False,
        help="Extract section headings from each revision.",
    )
    parser.add_argument(
        "--fandom-2020",
        dest="fandom_2020",
        action="store_true",
        help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.",
    )
    parser.add_argument(
        "--batch-size",
        dest="batch_size",
        default=1500,
        type=int,
        help="How many revisions to process in each batch. This ends up being the Parquet row group size",
    )
    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="Resume processing from the last successfully written revision in the output file.",
    )
    parser.add_argument(
        "--time-limit",
        dest="time_limit",
        type=float,
        default=0,
        help="Time limit in hours before graceful shutdown. Set to 0 to disable (default).",
    )
    parser.add_argument(
        "--partition-namespaces",
        dest="partition_namespaces",
        action="store_true",
        default=False,
        help="For Parquet output, partition output by namespace into separate files.",
    )
    parser.add_argument(
        "--max-revisions-per-file",
        dest="max_revisions_per_file",
        type=int,
        default=0,
        help="For Parquet output, split output into multiple files after this many revisions. Set to 0 to disable (default).",
    )
    args = parser.parse_args()

    # set persistence method
    if args.persist is None:
        persist = PersistMethod.none
    elif args.persist == "segment":
        persist = PersistMethod.segment
    elif args.persist == "legacy":
        persist = PersistMethod.legacy
    elif args.persist == "wikidiff2":
        persist = PersistMethod.wikidiff2
    else:
        # Both "" (bare -p) and "sequence" select the sequence matcher.
        persist = PersistMethod.sequence

    # None means "no namespace filtering".
    namespaces = args.namespace_filter

    # Handle --print-schema: build and output schema, then exit
    if args.print_schema:
        regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
        regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)
        table, _ = build_table(
            text=args.text,
            collapse_user=args.collapse_user,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
        )
        schema = build_schema(
            table,
            diff=args.diff,
            persist=persist,
            text=args.text,
            regex_revision_pairs=regex_revision_pairs,
            regex_comment_pairs=regex_comment_pairs,
        )
        spark_schema = pyarrow_to_spark_schema(schema)
        print(json.dumps(spark_schema, indent=2))
        sys.exit(0)

    print(args, file=sys.stderr)

    def run_with_shutdown_handlers(wikiq):
        """Run wikiq.process() with graceful-shutdown signal handlers.

        SIGTERM, SIGINT, SIGUSR1, and SIGUSR2 all request a graceful
        shutdown; the original handlers are restored afterwards even if
        process() raises.  (Previously the stdin branch forgot to register
        SIGUSR2; both branches now handle the same signal set.)
        """
        def handle_shutdown(signum, frame):
            sig_name = signal.Signals(signum).name
            print(f"\nReceived {sig_name}, requesting graceful shutdown...", file=sys.stderr)
            wikiq.request_shutdown()

        handled = (signal.SIGTERM, signal.SIGINT, signal.SIGUSR1, signal.SIGUSR2)
        originals = [signal.signal(sig, handle_shutdown) for sig in handled]
        try:
            wikiq.process()
        finally:
            # Restore original signal handlers
            for sig, original in zip(handled, originals):
                signal.signal(sig, original)

    # Invariant across all input files; 0 disables the limit.
    time_limit_seconds = args.time_limit * 3600 if args.time_limit > 0 else None

    if len(args.dumpfiles) > 0:
        for filename in args.dumpfiles:
            # Determine output file path before opening input (so resume errors are caught early)
            output = args.output[0] if args.output else "."

            # Detect output format from extension
            output_jsonl_dir = output.endswith(".jsonl.d")
            output_jsonl = output.endswith(".jsonl") or output_jsonl_dir
            output_parquet = output.endswith(".parquet")
            partition_namespaces = args.partition_namespaces and output_parquet

            if args.stdout:
                output_file = sys.stdout.buffer
            elif output_jsonl or output_parquet:
                # Output is a JSONL or Parquet file path - use it directly
                output_file = output
            elif os.path.isdir(output):
                # Output is a directory - derive filename from input
                output_filename = os.path.join(output, os.path.basename(filename))
                output_file = get_output_filename(output_filename, output_format='tsv')
            else:
                output_file = output

            # Handle resume functionality before opening input file
            resume_point = None
            if args.resume:
                if (output_jsonl or output_parquet) and not args.stdout:
                    # Clean up any interrupted resume from previous run
                    if output_parquet:
                        cleanup_result = cleanup_interrupted_resume(output_file, partition_namespaces)
                        if cleanup_result == "start_fresh":
                            resume_point = None
                        else:
                            resume_point = get_resume_point(output_file, partition_namespaces)
                    else:
                        # JSONL: get resume point from last line of file (no checkpoint)
                        resume_point = get_resume_point(output_file, input_file=filename)
                    if resume_point is not None:
                        # A dict maps namespace -> (pageid, revid); a tuple is a
                        # single global (pageid, revid) checkpoint.
                        if isinstance(resume_point, dict):
                            print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
                        else:
                            pageid, revid = resume_point[0], resume_point[1]
                            print(f"Resuming from checkpoint: pageid={pageid}, revid={revid}", file=sys.stderr)
                else:
                    sys.exit("Error: --resume only works with JSONL or Parquet output (not stdout or TSV)")

            # Now open the input file
            print("Processing file: %s" % filename, file=sys.stderr)
            input_file = open_input_file(filename, args.fandom_2020)

            wikiq = WikiqParser(
                input_file,
                output_file,
                collapse_user=args.collapse_user,
                persist=persist,
                namespaces=namespaces,
                revert_radius=args.revert_radius,
                regex_match_revision=args.regex_match_revision,
                regex_revision_label=args.regex_revision_label,
                regex_match_comment=args.regex_match_comment,
                regex_comment_label=args.regex_comment_label,
                text=args.text,
                diff=args.diff,
                output_jsonl=output_jsonl,
                output_jsonl_dir=output_jsonl_dir,
                output_parquet=output_parquet,
                partition_namespaces=partition_namespaces,
                batch_size=args.batch_size,
                resume_point=resume_point,
                external_links=args.external_links,
                citations=args.citations,
                wikilinks=args.wikilinks,
                templates=args.templates,
                headings=args.headings,
                time_limit_seconds=time_limit_seconds,
                max_revisions_per_file=args.max_revisions_per_file,
            )

            run_with_shutdown_handlers(wikiq)

            # close things
            input_file.close()
    else:
        # No dump files: read XML from stdin and write TSV to stdout.
        if args.resume:
            print("Warning: --resume cannot be used with stdin/stdout", file=sys.stderr)

        wikiq = WikiqParser(
            sys.stdin,
            sys.stdout,
            collapse_user=args.collapse_user,
            persist=persist,
            namespaces=namespaces,
            revert_radius=args.revert_radius,
            regex_match_revision=args.regex_match_revision,
            regex_revision_label=args.regex_revision_label,
            regex_match_comment=args.regex_match_comment,
            regex_comment_label=args.regex_comment_label,
            diff=args.diff,
            text=args.text,
            batch_size=args.batch_size,
            resume_point=None,
            external_links=args.external_links,
            citations=args.citations,
            wikilinks=args.wikilinks,
            templates=args.templates,
            headings=args.headings,
            time_limit_seconds=time_limit_seconds,
        )

        run_with_shutdown_handlers(wikiq)
# Standard script entry-point guard: run main() only when executed directly.
if __name__ == "__main__":
    main()