From ee01ce3e61aa7d91ca86b0d8c208d27cb5442812 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Wed, 28 May 2025 16:48:58 -0500 Subject: [PATCH] Get Parquet test working This requires some data smoothing to get read_table and read_parquet DataFrames to look close enough, but the test now passes and validates that the data match. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 51 ++++++++++++++++++++++++++++++++++------- wikiq | 2 +- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 4102ba3..9ae9da0 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -1,12 +1,18 @@ +import math import unittest import os import subprocess from shutil import copyfile + +import numpy as np import pandas as pd +from pandas import DataFrame +from pandas._testing import assert_series_equal from pandas.testing import assert_frame_equal from io import StringIO import tracemalloc from typing import Final +from datetime import datetime # Make references to files and wikiq relative to this file, not to the current working directory. TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) @@ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon" TWINPEAKS: Final[str] = "twinpeaks" REGEXTEST: Final[str] = "regextest" + def setup(): tracemalloc.start() @@ -41,6 +48,7 @@ class WikiqTester: case_name: str | None = None, suffix: str | None = None, in_compression: str = "bz2", + baseline_format: str = "tsv", out_format: str = "tsv", ): self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) @@ -51,14 +59,16 @@ class WikiqTester: self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR) if suffix is None: + self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) else: + self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format)) # If case_name is unset, there are no relevant baseline or test files. if case_name is not None: - self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name)) if os.path.exists(self.test_file): os.remove(self.test_file) @@ -78,6 +88,7 @@ class WikiqTester: print(call) return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + # with / without pwr DONE # with / without url encode DONE # with / without collapse user DONE @@ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase): baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - def test_WP_url_encode(self): tester = WikiqTester(IKWIKI, "url-encode") @@ -256,7 +266,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") try: - outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq("--stdout", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -304,7 +314,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) try: - tester.call_wikiq( arguments) + tester.call_wikiq(arguments) except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) def test_parquet(self): - tester = WikiqTester(IKWIKI, "parquet", out_format="parquet") + tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") try: tester.call_wikiq() @@ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase): copyfile(tester.call_output, tester.test_file) # as a test let's make sure that we get equal data frames - test = pd.read_parquet(tester.test_file) - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) + test: DataFrame = pd.read_parquet(tester.test_file) + # test = test.drop(['reverteds'], axis=1) + + baseline: DataFrame = pd.read_table(tester.baseline_file) + + # Pandas does not read timestamps as the desired datetime type. + baseline['date_time'] = pd.to_datetime(baseline['date_time']) + # Split strings to the arrays of reverted IDs so they can be compared. + baseline['revert'] = baseline['revert'].replace(np.nan, None) + baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] + baseline['sha1'] = baseline['sha1'].replace(np.nan, None) + baseline['editor'] = baseline['editor'].replace(np.nan, None) + baseline['anon'] = baseline['anon'].replace(np.nan, None) + + for index, row in baseline.iterrows(): + if row['editorid'] is None or test['editorid'][index] is None: + if row['editorid'] != test['editorid'][index]: + print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index]) + + for col in baseline.columns: + try: + assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) + except ValueError as exc: + print(f"Error comparing column {col}") + self.fail(exc) + + # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) + if __name__ == '__main__': unittest.main() diff --git a/wikiq b/wikiq index 3e278ad..ffd4183 100755 --- a/wikiq +++ b/wikiq @@ -270,7 +270,7 @@ class RevDataBase: pa.field("title", pa.string()), pa.field("namespace", pa.int32()), pa.field("deleted", pa.bool_()), - pa.field("test_chars", pa.int32()), + pa.field("text_chars", pa.int32()), pa.field("revert", pa.bool_()), pa.field("reverteds", pa.list_(pa.int64())), pa.field("sha1", pa.string()),