Add redirect_target column to wikiq output.

Exposes page.redirect as a nullable string column so downstream pipelines
can build redirect maps for link target resolution.
This commit is contained in:
Nathan TeBlunthuis
2026-03-01 20:05:12 -08:00
parent c7eb374ceb
commit 59fea1919e
2 changed files with 9 additions and 2 deletions

View File

@@ -133,6 +133,7 @@ def build_table(
tables.RevisionArticleId(), tables.RevisionArticleId(),
tables.RevisionPageTitle(), tables.RevisionPageTitle(),
tables.RevisionNamespace(), tables.RevisionNamespace(),
tables.RevisionRedirectTarget(),
tables.RevisionDeleted(), tables.RevisionDeleted(),
tables.RevisionEditorId(), tables.RevisionEditorId(),
tables.RevisionEditSummary(), tables.RevisionEditSummary(),
@@ -1004,7 +1005,7 @@ class WikiqParser:
if rev.id == page_resume_revid: if rev.id == page_resume_revid:
found_resume_point = True found_resume_point = True
is_resume_page = False is_resume_page = False
print(f"Resuming output after revid {rev.id}", file=sys.stderr) print(f"Resuming output after revid {rev.id}", file=sys.stderr, flush=True)
continue continue
rev_count += 1 rev_count += 1

View File

@@ -1,4 +1,3 @@
import sys
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
from datetime import datetime, timezone from datetime import datetime, timezone
from hashlib import sha1 from hashlib import sha1
@@ -144,6 +143,13 @@ class RevisionNamespace(RevisionField[int]):
return page.namespace return page.namespace
class RevisionRedirectTarget(RevisionField[Union[str, None]]):
    """Output column ``redirect_target``: the page's ``redirect`` attribute.

    The column is declared as a nullable string, so a page whose
    ``redirect`` attribute is ``None`` yields a null entry.
    """

    field = pa.field("redirect_target", pa.string(), nullable=True)

    def extract(
        self,
        page: mwtypes.Page,
        revisions: list[mwxml.Revision],
    ) -> Union[str, None]:
        """Return ``page.redirect`` unchanged.

        The value is read from the page alone; ``revisions`` is accepted
        but is not consulted here.
        """
        target = page.redirect
        return target
class RevisionSha1(RevisionField[str]): class RevisionSha1(RevisionField[str]):
field = pa.field("sha1", pa.string()) field = pa.field("sha1", pa.string())