Add redirect_target column to wikiq output.

Exposes page.redirect as a nullable string column so downstream pipelines
can build redirect maps for link target resolution.
This commit is contained in:
Nathan TeBlunthuis
2026-03-01 20:05:12 -08:00
parent c7eb374ceb
commit 59fea1919e
2 changed files with 9 additions and 2 deletions

View File

@@ -133,6 +133,7 @@ def build_table(
tables.RevisionArticleId(),
tables.RevisionPageTitle(),
tables.RevisionNamespace(),
tables.RevisionRedirectTarget(),
tables.RevisionDeleted(),
tables.RevisionEditorId(),
tables.RevisionEditSummary(),
@@ -1004,7 +1005,7 @@ class WikiqParser:
if rev.id == page_resume_revid:
found_resume_point = True
is_resume_page = False
print(f"Resuming output after revid {rev.id}", file=sys.stderr)
print(f"Resuming output after revid {rev.id}", file=sys.stderr, flush=True)
continue
rev_count += 1

View File

@@ -1,4 +1,3 @@
import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
@@ -144,6 +143,13 @@ class RevisionNamespace(RevisionField[int]):
return page.namespace
class RevisionRedirectTarget(RevisionField[Union[str, None]]):
    """Output column ``redirect_target``, read from the page's ``redirect``.

    The value comes from the page object itself; the ``revisions``
    argument is not consulted. The result is ``None`` whenever
    ``page.redirect`` is ``None``, which the nullable field permits.
    """

    # Schema entry for the column: string-typed and explicitly nullable.
    field = pa.field("redirect_target", pa.string(), nullable=True)

    def extract(
        self, page: mwtypes.Page, revisions: list[mwxml.Revision]
    ) -> Union[str, None]:
        """Return ``page.redirect`` unchanged; ``revisions`` is ignored."""
        redirect_target = page.redirect
        return redirect_target
class RevisionSha1(RevisionField[str]):
field = pa.field("sha1", pa.string())