Get Parquet test working

This requires some data smoothing so that the DataFrames returned by read_table and
read_parquet look close enough to compare as equal, but the test now passes and
validates that the data match.

Signed-off-by: Will Beason <willbeason@gmail.com>
Will Beason 2025-05-28 16:48:58 -05:00
parent 52757a8239
commit ee01ce3e61
2 changed files with 44 additions and 9 deletions
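
The "data smoothing" mentioned above amounts to three normalizations, visible in the
test_parquet hunk below: parse the baseline's date_time strings into datetimes, split
the comma-separated reverteds strings into lists of ints, and replace NaN with None in
nullable columns before comparing column by column. A minimal, self-contained sketch of
that idea (editorial, not part of the commit; the file paths and the smooth_baseline
helper are hypothetical, while the column names and operations are taken from the diff):

import numpy as np
import pandas as pd
from pandas.testing import assert_series_equal


def smooth_baseline(baseline: pd.DataFrame) -> pd.DataFrame:
    # TSV baselines come back with string timestamps; Parquet preserves datetimes.
    baseline['date_time'] = pd.to_datetime(baseline['date_time'])
    # Parquet stores reverteds as a list of revision ids; the TSV has "1,2,3" strings.
    baseline['reverteds'] = [
        None if i is np.nan else [int(j) for j in str(i).split(",")]
        for i in baseline['reverteds']
    ]
    # Missing values are NaN when read from TSV but None when read from Parquet.
    for col in ['revert', 'sha1', 'editor', 'anon']:
        baseline[col] = baseline[col].replace(np.nan, None)
    return baseline


baseline = smooth_baseline(pd.read_table("baseline.tsv"))  # hypothetical path
test = pd.read_parquet("output.parquet")                   # hypothetical path
for col in baseline.columns:
    # Compare column by column, ignoring dtype differences between the two formats.
    assert_series_equal(test[col], baseline[col], check_dtype=False)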


@@ -1,12 +1,18 @@
+import math
 import unittest
 import os
 import subprocess
 from shutil import copyfile
+import numpy as np
 import pandas as pd
+from pandas import DataFrame
+from pandas._testing import assert_series_equal
 from pandas.testing import assert_frame_equal
 from io import StringIO
 import tracemalloc
 from typing import Final
+from datetime import datetime
 
 # Make references to files and wikiq relative to this file, not to the current working directory.
 TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
@@ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon"
 TWINPEAKS: Final[str] = "twinpeaks"
 REGEXTEST: Final[str] = "regextest"
 
 
 def setup():
     tracemalloc.start()
@@ -41,6 +48,7 @@ class WikiqTester:
        case_name: str | None = None,
        suffix: str | None = None,
        in_compression: str = "bz2",
+        baseline_format: str = "tsv",
        out_format: str = "tsv",
    ):
        self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
@@ -51,14 +59,16 @@
         self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
 
         if suffix is None:
+            self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
         else:
+            self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
 
         self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
         # If case_name is unset, there are no relevant baseline or test files.
         if case_name is not None:
-            self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
+            self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
             self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
             if os.path.exists(self.test_file):
                 os.remove(self.test_file)
@@ -78,6 +88,7 @@ class WikiqTester:
         print(call)
         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
 
 # with / without pwr DONE
 # with / without url encode DONE
 # with / without collapse user DONE
@@ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)
 
-
     def test_WP_url_encode(self):
         tester = WikiqTester(IKWIKI, "url-encode")
@@ -256,7 +266,7 @@ class WikiqTestCase(unittest.TestCase):
         tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
 
         try:
-            outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
+            outs = tester.call_wikiq("--stdout", out=False).decode("utf8")
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))
@@ -304,7 +314,7 @@ class WikiqTestCase(unittest.TestCase):
             tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))
 
             try:
-                tester.call_wikiq( arguments)
+                tester.call_wikiq(arguments)
             except subprocess.CalledProcessError as exc:
                 self.fail(exc.stderr.decode("utf8"))
@@ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
         assert_frame_equal(test, baseline, check_like=True)
 
     def test_parquet(self):
-        tester = WikiqTester(IKWIKI, "parquet", out_format="parquet")
+        tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
 
         try:
             tester.call_wikiq()
@@ -348,9 +358,34 @@
         copyfile(tester.call_output, tester.test_file)
 
         # as a test let's make sure that we get equal data frames
-        test = pd.read_parquet(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
+        test: DataFrame = pd.read_parquet(tester.test_file)
+        # test = test.drop(['reverteds'], axis=1)
+
+        baseline: DataFrame = pd.read_table(tester.baseline_file)
+
+        # Pandas does not read timestamps as the desired datetime type.
+        baseline['date_time'] = pd.to_datetime(baseline['date_time'])
+
+        # Split strings to the arrays of reverted IDs so they can be compared.
+        baseline['revert'] = baseline['revert'].replace(np.nan, None)
+        baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
+        baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
+        baseline['editor'] = baseline['editor'].replace(np.nan, None)
+        baseline['anon'] = baseline['anon'].replace(np.nan, None)
+
+        for index, row in baseline.iterrows():
+            if row['editorid'] is None or test['editorid'][index] is None:
+                if row['editorid'] != test['editorid'][index]:
+                    print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
+
+        for col in baseline.columns:
+            try:
+                assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
+            except ValueError as exc:
+                print(f"Error comparing column {col}")
+                self.fail(exc)
+
+        # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
 
 if __name__ == '__main__':
     unittest.main()

wikiq

@@ -270,7 +270,7 @@ class RevDataBase:
         pa.field("title", pa.string()),
         pa.field("namespace", pa.int32()),
         pa.field("deleted", pa.bool_()),
-        pa.field("test_chars", pa.int32()),
+        pa.field("text_chars", pa.int32()),
         pa.field("revert", pa.bool_()),
         pa.field("reverteds", pa.list_(pa.int64())),
         pa.field("sha1", pa.string()),