Get Parquet test working

This requires some normalization of the baseline data so that the
DataFrames returned by read_table and read_parquet are comparable, but
the test now passes and validates that the data match.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-28 16:48:58 -05:00
parent 52757a8239
commit ee01ce3e61
2 changed files with 44 additions and 9 deletions

View File

@ -1,12 +1,18 @@
import math
import unittest
import os
import subprocess
from shutil import copyfile
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas._testing import assert_series_equal
from pandas.testing import assert_frame_equal
from io import StringIO
import tracemalloc
from typing import Final
from datetime import datetime
# Make references to files and wikiq relative to this file, not to the current working directory.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
@ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon"
TWINPEAKS: Final[str] = "twinpeaks"
REGEXTEST: Final[str] = "regextest"
def setup():
tracemalloc.start()
@ -41,6 +48,7 @@ class WikiqTester:
case_name: str | None = None,
suffix: str | None = None,
in_compression: str = "bz2",
baseline_format: str = "tsv",
out_format: str = "tsv",
):
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
@ -51,14 +59,16 @@ class WikiqTester:
self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
if suffix is None:
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
else:
self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
# If case_name is unset, there are no relevant baseline or test files.
if case_name is not None:
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
if os.path.exists(self.test_file):
os.remove(self.test_file)
@ -78,6 +88,7 @@ class WikiqTester:
print(call)
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
@ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_WP_url_encode(self):
tester = WikiqTester(IKWIKI, "url-encode")
@ -256,7 +266,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
try:
outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
outs = tester.call_wikiq("--stdout", out=False).decode("utf8")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -304,7 +314,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))
try:
tester.call_wikiq( arguments)
tester.call_wikiq(arguments)
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
assert_frame_equal(test, baseline, check_like=True)
def test_parquet(self):
tester = WikiqTester(IKWIKI, "parquet", out_format="parquet")
tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
try:
tester.call_wikiq()
@ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase):
copyfile(tester.call_output, tester.test_file)
# as a test let's make sure that we get equal data frames
test = pd.read_parquet(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
test: DataFrame = pd.read_parquet(tester.test_file)
# test = test.drop(['reverteds'], axis=1)
baseline: DataFrame = pd.read_table(tester.baseline_file)
# Pandas does not read timestamps as the desired datetime type.
baseline['date_time'] = pd.to_datetime(baseline['date_time'])
# Split the comma-separated strings into lists of reverted IDs so they can be compared.
baseline['revert'] = baseline['revert'].replace(np.nan, None)
baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
baseline['editor'] = baseline['editor'].replace(np.nan, None)
baseline['anon'] = baseline['anon'].replace(np.nan, None)
for index, row in baseline.iterrows():
if row['editorid'] is None or test['editorid'][index] is None:
if row['editorid'] != test['editorid'][index]:
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
for col in baseline.columns:
try:
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
except ValueError as exc:
print(f"Error comparing column {col}")
self.fail(exc)
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
if __name__ == '__main__':
unittest.main()

2
wikiq
View File

@ -270,7 +270,7 @@ class RevDataBase:
pa.field("title", pa.string()),
pa.field("namespace", pa.int32()),
pa.field("deleted", pa.bool_()),
pa.field("test_chars", pa.int32()),
pa.field("text_chars", pa.int32()),
pa.field("revert", pa.bool_()),
pa.field("reverteds", pa.list_(pa.int64())),
pa.field("sha1", pa.string()),