refactor and enable jsonl output.

This commit is contained in:
Nathan TeBlunthuis
2025-12-21 23:42:18 -08:00
parent 6988a281dc
commit 3f1a9ba862
7 changed files with 1429 additions and 1242 deletions

View File

@@ -7,10 +7,12 @@ from io import StringIO
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.json as pj
import pytest
from pandas import DataFrame
from pandas.testing import assert_frame_equal, assert_series_equal
from wikiq import build_table, build_schema
from wikiq_test_utils import (
BASELINE_DIR,
IKWIKI,
@@ -34,6 +36,17 @@ def setup():
setup()
def read_jsonl_with_schema(filepath: str, **schema_kwargs) -> pd.DataFrame:
    """Load a wikiq JSONL output file into a pandas DataFrame.

    Builds the wikiq output schema from ``schema_kwargs`` and hands it to
    PyArrow as an explicit schema, so columns are parsed with their intended
    types instead of being inferred from the JSON text.
    """
    table, _ = build_table(**schema_kwargs)
    explicit_schema = build_schema(table, **schema_kwargs)
    parse_options = pj.ParseOptions(explicit_schema=explicit_schema)
    arrow_table = pj.read_json(filepath, parse_options=parse_options)
    return arrow_table.to_pandas()
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
@@ -124,7 +137,62 @@ def test_noargs():
test = pd.read_table(tester.output)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_jsonl_noargs():
    """Test JSONL output format with baseline comparison."""
    tester = WikiqTester(
        SAILORMOON,
        "noargs",
        in_compression="7z",
        out_format="jsonl",
        baseline_format="jsonl",
    )

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Read both files through the schema-aware reader so dtypes agree.
    result = read_jsonl_with_schema(tester.output)
    expected = read_jsonl_with_schema(tester.baseline_file)
    assert_frame_equal(result, expected, check_like=True)
def test_jsonl_tsv_equivalence():
    """Test that JSONL and TSV outputs contain equivalent data."""
    tester_tsv = WikiqTester(SAILORMOON, "equiv_tsv", in_compression="7z", out_format="tsv")
    tester_jsonl = WikiqTester(SAILORMOON, "equiv_jsonl", in_compression="7z", out_format="jsonl")

    try:
        tester_tsv.call_wikiq()
        tester_jsonl.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    tsv_df = pd.read_table(tester_tsv.output)
    jsonl_df = read_jsonl_with_schema(tester_jsonl.output)

    # Both formats must describe the same revisions and the same fields.
    assert len(tsv_df) == len(jsonl_df), f"Row count mismatch: TSV={len(tsv_df)}, JSONL={len(jsonl_df)}"
    assert set(tsv_df.columns) == set(jsonl_df.columns), \
        f"Column mismatch: TSV={set(tsv_df.columns)}, JSONL={set(jsonl_df.columns)}"

    # Put both frames in revid order so rows line up positionally.
    tsv_df = tsv_df.sort_values("revid").reset_index(drop=True)
    jsonl_df = jsonl_df.sort_values("revid").reset_index(drop=True)

    # Normalize null values: TSV uses nan, schema-based JSONL uses None
    jsonl_df = jsonl_df.replace({None: np.nan})

    for col in tsv_df.columns:
        if col != "date_time":
            # Allow dtype differences (TSV infers int64, schema uses int32)
            assert_series_equal(tsv_df[col], jsonl_df[col], check_names=False, check_dtype=False)
            continue
        # TSV reads timestamps as strings while the JSONL schema yields
        # datetimes; render both the same way before comparing.
        tsv_dates = pd.to_datetime(tsv_df[col]).dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        jsonl_dates = jsonl_df[col].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        assert_series_equal(tsv_dates, jsonl_dates, check_names=False)
def test_collapse_user():
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
@@ -137,19 +205,6 @@ def test_collapse_user():
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_partition_namespaces():
    """Partitioned parquet output should match the collapse-user baseline."""
    tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z", out_format='parquet', baseline_format='parquet')

    try:
        tester.call_wikiq("--collapse-user", "--fandom-2020", "--partition-namespaces")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Partitioned output lands under a namespace=<n>/ subdirectory.
    partition_path = os.path.join(tester.output, "namespace=10/sailormoon.parquet")
    test = pd.read_parquet(partition_path)
    baseline = pd.read_parquet(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
def test_pwr_wikidiff2():
tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
@@ -201,46 +256,43 @@ def test_pwr():
assert_frame_equal(test, baseline, check_like=True)
def test_diff():
    """--diff should add diff and diff_timeout columns to the JSONL output."""
    # Fix: this block contained both the old parquet version and the new
    # jsonl version of the test mashed together — a dead duplicate
    # `tester =` assignment and a pd.read_parquet() call against a jsonl
    # file path (which would raise) whose result was immediately
    # overwritten. Keep only the coherent jsonl version.
    tester = WikiqTester(SAILORMOON, "diff", in_compression="7z", out_format='jsonl')

    try:
        tester.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "diff" in test.columns, "diff column should exist"
    assert "diff_timeout" in test.columns, "diff_timeout column should exist"
    assert len(test) > 0, "Should have output rows"
def test_diff_plus_pwr():
    """--diff with wikidiff2 persistence should yield diff and token_revs columns."""
    # Fix: this block contained both the old parquet version and the new
    # jsonl version mashed together (dead duplicate `tester =` assignment;
    # pd.read_parquet() on a jsonl path whose result was immediately
    # overwritten). Keep only the coherent jsonl version.
    tester = WikiqTester(SAILORMOON, "diff_pwr", in_compression="7z", out_format='jsonl')

    try:
        # NOTE(review): the first argument bundles two flags in one string
        # ("--diff --persistence wikidiff2") while sibling tests pass flags
        # as separate arguments — confirm call_wikiq splits this as intended.
        tester.call_wikiq("--diff --persistence wikidiff2", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "diff" in test.columns, "diff column should exist"
    assert "token_revs" in test.columns, "token_revs column should exist"
    assert len(test) > 0, "Should have output rows"
def test_text():
    """--text together with --diff should emit text and diff columns."""
    # Fix: this block contained both the old parquet version and the new
    # jsonl version mashed together (dead duplicate `tester =` assignment;
    # pd.read_parquet() on a jsonl path whose result was immediately
    # overwritten). Keep only the coherent jsonl version.
    tester = WikiqTester(SAILORMOON, "text", in_compression="7z", out_format='jsonl')

    try:
        tester.call_wikiq("--diff", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "text" in test.columns, "text column should exist"
    assert "diff" in test.columns, "diff column should exist"
    assert len(test) > 0, "Should have output rows"
def test_malformed_noargs():
tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
@@ -339,51 +391,11 @@ def test_capturegroup_regex():
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_parquet():
    """Parquet output should match the TSV baseline column-by-column."""
    tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # as a test let's make sure that we get equal data frames
    test: DataFrame = pd.read_parquet(tester.output)
    baseline: DataFrame = pd.read_table(tester.baseline_file)

    # Pandas does not read timestamps as the desired datetime type.
    baseline["date_time"] = pd.to_datetime(baseline["date_time"])

    # Parquet stores missing values as None; align the TSV baseline's NaNs
    # so the per-column comparison below sees matching nulls.
    for col in ("revert", "reverteds", "sha1", "editor", "anon"):
        baseline[col] = baseline[col].replace(np.nan, None)

    # Diagnostic output: report any revert mismatches before asserting.
    for index, row in baseline.iterrows():
        if row["revert"] != test["revert"][index]:
            print(row["revid"], ":", row["revert"], "!=", test["revert"][index])

    for col in baseline.columns:
        try:
            # NOTE(review): check_like on assert_series_equal requires
            # pandas >= 2.1 — confirm the pinned pandas version supports it.
            assert_series_equal(
                test[col], baseline[col], check_like=True, check_dtype=False
            )
        except ValueError as exc:
            print(f"Error comparing column {col}")
            # Fix: pytest.fail expects a string reason; the original passed
            # the ValueError object itself. Also removed commented-out code.
            pytest.fail(str(exc))
def test_external_links_only():
"""Test that --external-links extracts external links correctly."""
import mwparserfromhell
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="jsonl")
try:
# Also include --text so we can verify extraction against actual wikitext
@@ -391,7 +403,7 @@ def test_external_links_only():
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify external_links column exists
assert "external_links" in test.columns, "external_links column should exist"
@@ -438,7 +450,7 @@ def test_citations_only():
import mwparserfromhell
from wikiq.wikitext_parser import WikitextParser
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="jsonl")
try:
# Also include --text so we can verify extraction against actual wikitext
@@ -446,7 +458,7 @@ def test_citations_only():
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify citations column exists
assert "citations" in test.columns, "citations column should exist"
@@ -490,7 +502,7 @@ def test_external_links_and_citations():
import mwparserfromhell
from wikiq.wikitext_parser import WikitextParser
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="jsonl")
try:
# Also include --text so we can verify extraction against actual wikitext
@@ -498,7 +510,7 @@ def test_external_links_and_citations():
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify both columns exist
assert "external_links" in test.columns, "external_links column should exist"
@@ -564,14 +576,14 @@ def test_external_links_and_citations():
def test_no_wikitext_columns():
"""Test that neither external_links nor citations columns exist without flags."""
tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="jsonl")
try:
tester.call_wikiq("--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify neither column exists
assert "external_links" not in test.columns, "external_links column should NOT exist without --external-links flag"
@@ -584,14 +596,14 @@ def test_wikilinks():
"""Test that --wikilinks extracts internal wikilinks correctly."""
import mwparserfromhell
tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="jsonl")
try:
tester.call_wikiq("--wikilinks", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify wikilinks column exists
assert "wikilinks" in test.columns, "wikilinks column should exist"
@@ -625,14 +637,14 @@ def test_templates():
"""Test that --templates extracts templates correctly."""
import mwparserfromhell
tester = WikiqTester(SAILORMOON, "templates", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "templates", in_compression="7z", out_format="jsonl")
try:
tester.call_wikiq("--templates", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify templates column exists
assert "templates" in test.columns, "templates column should exist"
@@ -675,14 +687,14 @@ def test_headings():
"""Test that --headings extracts section headings correctly."""
import mwparserfromhell
tester = WikiqTester(SAILORMOON, "headings", in_compression="7z", out_format="parquet")
tester = WikiqTester(SAILORMOON, "headings", in_compression="7z", out_format="jsonl")
try:
tester.call_wikiq("--headings", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
test = pd.read_json(tester.output, lines=True)
# Verify headings column exists
assert "headings" in test.columns, "headings column should exist"
@@ -712,3 +724,37 @@ def test_headings():
print(f"Headings test passed! {len(test)} rows processed")
def test_parquet_output():
    """Test that Parquet output format works correctly."""
    tester = WikiqTester(SAILORMOON, "parquet_output", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Verify output file exists
    assert os.path.exists(tester.output), f"Parquet output file should exist at {tester.output}"

    # Read and verify content
    test = pd.read_parquet(tester.output)

    # Core revision metadata columns must always be present.
    for expected_col in ("revid", "articleid", "title", "namespace"):
        assert expected_col in test.columns

    # A JSONL run over the same input should produce the same number of rows.
    tester_jsonl = WikiqTester(SAILORMOON, "parquet_compare", in_compression="7z", out_format="jsonl")
    try:
        tester_jsonl.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test_jsonl = pd.read_json(tester_jsonl.output, lines=True)
    assert len(test) == len(test_jsonl), f"Parquet and JSONL should have same row count: {len(test)} vs {len(test_jsonl)}"

    print(f"Parquet output test passed! {len(test)} rows")

File diff suppressed because it is too large Load Diff

View File

@@ -42,8 +42,20 @@ class WikiqTester:
else:
shutil.rmtree(self.output)
if out_format == "parquet":
os.makedirs(self.output, exist_ok=True)
# Also clean up resume-related files
for suffix in [".resume_temp", ".checkpoint", ".merged"]:
temp_path = self.output + suffix
if os.path.exists(temp_path):
if os.path.isfile(temp_path):
os.remove(temp_path)
else:
shutil.rmtree(temp_path)
# For JSONL and Parquet, self.output is a file path. Create parent directory if needed.
if out_format in ("jsonl", "parquet"):
parent_dir = os.path.dirname(self.output)
if parent_dir:
os.makedirs(parent_dir, exist_ok=True)
if suffix is None:
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)