refactor and enable jsonl output.
This commit is contained in:
@@ -7,10 +7,12 @@ from io import StringIO
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.json as pj
|
||||
import pytest
|
||||
from pandas import DataFrame
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
from wikiq import build_table, build_schema
|
||||
from wikiq_test_utils import (
|
||||
BASELINE_DIR,
|
||||
IKWIKI,
|
||||
@@ -34,6 +36,17 @@ def setup():
|
||||
setup()
|
||||
|
||||
|
||||
def read_jsonl_with_schema(filepath: str, **schema_kwargs) -> pd.DataFrame:
    """Load a JSONL file into a DataFrame using an explicit PyArrow schema.

    The schema is obtained from wikiq: ``build_table`` is called with
    ``schema_kwargs`` and the first element of its result is handed, together
    with the same keyword arguments, to ``build_schema``.

    Args:
        filepath: Path of the JSONL file to read.
        **schema_kwargs: Forwarded unchanged to both ``build_table`` and
            ``build_schema``.

    Returns:
        The parsed file converted to a pandas DataFrame.
    """
    table, _ = build_table(**schema_kwargs)
    explicit_schema = build_schema(table, **schema_kwargs)
    parse_options = pj.ParseOptions(explicit_schema=explicit_schema)
    return pj.read_json(filepath, parse_options=parse_options).to_pandas()
|
||||
|
||||
|
||||
# with / without pwr DONE
|
||||
# with / without url encode DONE
|
||||
# with / without collapse user DONE
|
||||
@@ -124,7 +137,62 @@ def test_noargs():
|
||||
test = pd.read_table(tester.output)
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
|
||||
|
||||
def test_jsonl_noargs():
    """Run wikiq without extra flags in JSONL mode and compare to the JSONL baseline."""
    tester = WikiqTester(
        SAILORMOON,
        "noargs",
        in_compression="7z",
        out_format="jsonl",
        baseline_format="jsonl",
    )

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    # Both files are parsed with the same explicit schema before comparison.
    produced = read_jsonl_with_schema(tester.output)
    expected = read_jsonl_with_schema(tester.baseline_file)
    assert_frame_equal(produced, expected, check_like=True)
|
||||
|
||||
|
||||
def test_jsonl_tsv_equivalence():
    """Verify that the TSV and JSONL outputs of the same run hold the same data.

    Row counts and column sets are compared first; then both frames are
    sorted by ``revid`` and compared column by column.
    """
    tester_tsv = WikiqTester(SAILORMOON, "equiv_tsv", in_compression="7z", out_format="tsv")
    tester_jsonl = WikiqTester(SAILORMOON, "equiv_jsonl", in_compression="7z", out_format="jsonl")

    try:
        tester_tsv.call_wikiq()
        tester_jsonl.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    tsv_df = pd.read_table(tester_tsv.output)
    jsonl_df = read_jsonl_with_schema(tester_jsonl.output)

    # Cheap structural checks before any value comparison.
    assert len(tsv_df) == len(jsonl_df), f"Row count mismatch: TSV={len(tsv_df)}, JSONL={len(jsonl_df)}"
    assert set(tsv_df.columns) == set(jsonl_df.columns), \
        f"Column mismatch: TSV={set(tsv_df.columns)}, JSONL={set(jsonl_df.columns)}"

    # Align row order on revid so the series can be compared positionally.
    tsv_df = tsv_df.sort_values("revid").reset_index(drop=True)
    jsonl_df = jsonl_df.sort_values("revid").reset_index(drop=True)

    # The TSV frame holds nan for missing values; map None in the JSONL frame to nan too.
    jsonl_df = jsonl_df.replace({None: np.nan})

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    for col in tsv_df.columns:
        if col != "date_time":
            # dtype differences are tolerated (e.g. int64 inferred from TSV vs int32 from the schema).
            assert_series_equal(tsv_df[col], jsonl_df[col], check_names=False, check_dtype=False)
            continue

        # TSV yields strings and the schema-based JSONL read yields datetimes,
        # so both sides are rendered to the same string format before comparing.
        tsv_dates = pd.to_datetime(tsv_df[col]).dt.strftime(timestamp_format)
        jsonl_dates = jsonl_df[col].dt.strftime(timestamp_format)
        assert_series_equal(tsv_dates, jsonl_dates, check_names=False)
|
||||
|
||||
|
||||
def test_collapse_user():
|
||||
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
||||
|
||||
@@ -137,19 +205,6 @@ def test_collapse_user():
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
def test_partition_namespaces():
    """Run wikiq with ``--partition-namespaces`` and compare the namespace=10 partition to the Parquet baseline."""
    tester = WikiqTester(
        SAILORMOON,
        "collapse-user",
        in_compression="7z",
        out_format='parquet',
        baseline_format='parquet',
    )
    wikiq_args = ("--collapse-user", "--fandom-2020", "--partition-namespaces")

    try:
        tester.call_wikiq(*wikiq_args)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Only the namespace=10 partition file is read back and checked.
    partition_file = os.path.join(tester.output, "namespace=10/sailormoon.parquet")
    produced = pd.read_parquet(partition_file)
    expected = pd.read_parquet(tester.baseline_file)
    assert_frame_equal(produced, expected, check_like=True)
|
||||
|
||||
|
||||
def test_pwr_wikidiff2():
|
||||
tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
|
||||
|
||||
@@ -201,46 +256,43 @@ def test_pwr():
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
def test_diff():
    """Check that ``--diff`` produces JSONL output containing the diff columns.

    Only the JSONL code path is kept. The earlier Parquet-based statements
    (a first ``WikiqTester`` that was immediately overwritten, and a
    ``read_parquet`` / baseline comparison against a tester configured for
    JSONL) contradicted the JSONL configuration and ran before the JSONL
    assertions, so they are dropped.
    """
    tester = WikiqTester(SAILORMOON, "diff", in_compression="7z", out_format='jsonl')

    try:
        tester.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "diff" in test.columns, "diff column should exist"
    assert "diff_timeout" in test.columns, "diff_timeout column should exist"
    assert len(test) > 0, "Should have output rows"
|
||||
|
||||
def test_diff_plus_pwr():
    """Check that diff plus wikidiff2 persistence produces JSONL with both outputs.

    Only the JSONL code path is kept. The earlier Parquet-based statements
    (an overwritten first ``WikiqTester`` and a ``read_parquet`` / baseline
    comparison against a tester configured for JSONL) contradicted the
    JSONL configuration and ran before the JSONL assertions, so they are
    dropped.
    """
    tester = WikiqTester(SAILORMOON, "diff_pwr", in_compression="7z", out_format='jsonl')

    try:
        # NOTE(review): the first argument bundles several flags in one string;
        # kept exactly as written - confirm call_wikiq splits or shell-joins it.
        tester.call_wikiq("--diff --persistence wikidiff2", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "diff" in test.columns, "diff column should exist"
    assert "token_revs" in test.columns, "token_revs column should exist"
    assert len(test) > 0, "Should have output rows"
|
||||
|
||||
def test_text():
    """Check that ``--diff --text`` produces JSONL output with text and diff columns.

    Only the JSONL code path is kept. The earlier Parquet-based statements
    (an overwritten first ``WikiqTester`` and a ``read_parquet`` / baseline
    comparison against a tester configured for JSONL) contradicted the
    JSONL configuration and ran before the JSONL assertions, so they are
    dropped.
    """
    tester = WikiqTester(SAILORMOON, "text", in_compression="7z", out_format='jsonl')

    try:
        tester.call_wikiq("--diff", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test = pd.read_json(tester.output, lines=True)
    assert "text" in test.columns, "text column should exist"
    assert "diff" in test.columns, "diff column should exist"
    assert len(test) > 0, "Should have output rows"
|
||||
|
||||
def test_malformed_noargs():
|
||||
tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
|
||||
@@ -339,51 +391,11 @@ def test_capturegroup_regex():
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
def test_parquet():
    """Compare wikiq's Parquet output for IKWIKI against the TSV baseline, column by column."""
    tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test: DataFrame = pd.read_parquet(tester.output)
    baseline: DataFrame = pd.read_table(tester.baseline_file)

    # read_table does not produce the datetime type, so convert explicitly.
    baseline["date_time"] = pd.to_datetime(baseline["date_time"])

    # Swap NaN for None in these baseline columns before comparing.
    for nullable_col in ("revert", "reverteds", "sha1", "editor", "anon"):
        baseline[nullable_col] = baseline[nullable_col].replace(np.nan, None)

    # Diagnostic output only: print every revision whose revert value differs.
    for index, row in baseline.iterrows():
        produced_revert = test["revert"][index]
        if row["revert"] != produced_revert:
            print(row["revid"], ":", row["revert"], "!=", produced_revert)

    for col in baseline.columns:
        try:
            assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
        except ValueError as exc:
            print(f"Error comparing column {col}")
            pytest.fail(exc)
|
||||
|
||||
|
||||
def test_external_links_only():
|
||||
"""Test that --external-links extracts external links correctly."""
|
||||
import mwparserfromhell
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
@@ -391,7 +403,7 @@ def test_external_links_only():
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify external_links column exists
|
||||
assert "external_links" in test.columns, "external_links column should exist"
|
||||
@@ -438,7 +450,7 @@ def test_citations_only():
|
||||
import mwparserfromhell
|
||||
from wikiq.wikitext_parser import WikitextParser
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
@@ -446,7 +458,7 @@ def test_citations_only():
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify citations column exists
|
||||
assert "citations" in test.columns, "citations column should exist"
|
||||
@@ -490,7 +502,7 @@ def test_external_links_and_citations():
|
||||
import mwparserfromhell
|
||||
from wikiq.wikitext_parser import WikitextParser
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
@@ -498,7 +510,7 @@ def test_external_links_and_citations():
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify both columns exist
|
||||
assert "external_links" in test.columns, "external_links column should exist"
|
||||
@@ -564,14 +576,14 @@ def test_external_links_and_citations():
|
||||
|
||||
def test_no_wikitext_columns():
|
||||
"""Test that neither external_links nor citations columns exist without flags."""
|
||||
tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify neither column exists
|
||||
assert "external_links" not in test.columns, "external_links column should NOT exist without --external-links flag"
|
||||
@@ -584,14 +596,14 @@ def test_wikilinks():
|
||||
"""Test that --wikilinks extracts internal wikilinks correctly."""
|
||||
import mwparserfromhell
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--wikilinks", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify wikilinks column exists
|
||||
assert "wikilinks" in test.columns, "wikilinks column should exist"
|
||||
@@ -625,14 +637,14 @@ def test_templates():
|
||||
"""Test that --templates extracts templates correctly."""
|
||||
import mwparserfromhell
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "templates", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "templates", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--templates", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify templates column exists
|
||||
assert "templates" in test.columns, "templates column should exist"
|
||||
@@ -675,14 +687,14 @@ def test_headings():
|
||||
"""Test that --headings extracts section headings correctly."""
|
||||
import mwparserfromhell
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "headings", in_compression="7z", out_format="parquet")
|
||||
tester = WikiqTester(SAILORMOON, "headings", in_compression="7z", out_format="jsonl")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--headings", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
|
||||
test = pd.read_json(tester.output, lines=True)
|
||||
|
||||
# Verify headings column exists
|
||||
assert "headings" in test.columns, "headings column should exist"
|
||||
@@ -712,3 +724,37 @@ def test_headings():
|
||||
print(f"Headings test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_parquet_output():
    """Exercise Parquet output and cross-check its row count against a JSONL run.

    The Parquet run is validated first (file exists, core columns present);
    a second wikiq run in JSONL mode then supplies the reference row count.
    """
    tester = WikiqTester(SAILORMOON, "parquet_output", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    assert os.path.exists(tester.output), f"Parquet output file should exist at {tester.output}"

    test = pd.read_parquet(tester.output)

    # Core columns expected in the output.
    for required in ("revid", "articleid", "title", "namespace"):
        assert required in test.columns

    # Second run: same input and flags, JSONL output, used only for its length.
    tester_jsonl = WikiqTester(SAILORMOON, "parquet_compare", in_compression="7z", out_format="jsonl")
    try:
        tester_jsonl.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    test_jsonl = pd.read_json(tester_jsonl.output, lines=True)
    assert len(test) == len(test_jsonl), f"Parquet and JSONL should have same row count: {len(test)} vs {len(test_jsonl)}"

    print(f"Parquet output test passed! {len(test)} rows")
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -42,8 +42,20 @@ class WikiqTester:
|
||||
else:
|
||||
shutil.rmtree(self.output)
|
||||
|
||||
if out_format == "parquet":
|
||||
os.makedirs(self.output, exist_ok=True)
|
||||
# Also clean up resume-related files
|
||||
# Remove leftover resume-related artifacts next to the output path.
# The loop variable is deliberately NOT called `suffix`: the code that
# follows tests `if suffix is None`, and reusing that name here would leave
# it bound to ".merged", so that branch could never be taken.
for resume_suffix in (".resume_temp", ".checkpoint", ".merged"):
    temp_path = self.output + resume_suffix
    if os.path.exists(temp_path):
        if os.path.isfile(temp_path):
            os.remove(temp_path)
        else:
            # Anything that exists but is not a regular file is removed as a tree.
            shutil.rmtree(temp_path)
|
||||
|
||||
# For JSONL and Parquet, self.output is a file path. Create parent directory if needed.
|
||||
if out_format in ("jsonl", "parquet"):
|
||||
parent_dir = os.path.dirname(self.output)
|
||||
if parent_dir:
|
||||
os.makedirs(parent_dir, exist_ok=True)
|
||||
|
||||
if suffix is None:
|
||||
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
|
||||
|
||||
Reference in New Issue
Block a user