Get Parquet test working
This requires some data smoothing so that the DataFrames produced by read_table and read_parquet are close enough to compare, but the test now passes and validates that the data match.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
52757a8239
commit
ee01ce3e61
@ -1,12 +1,18 @@
|
||||
import math
|
||||
import unittest
|
||||
import os
|
||||
import subprocess
|
||||
from shutil import copyfile
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
from pandas._testing import assert_series_equal
|
||||
from pandas.testing import assert_frame_equal
|
||||
from io import StringIO
|
||||
import tracemalloc
|
||||
from typing import Final
|
||||
from datetime import datetime
|
||||
|
||||
# Make references to files and wikiq relative to this file, not to the current working directory.
|
||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||
@ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon"
|
||||
TWINPEAKS: Final[str] = "twinpeaks"
|
||||
REGEXTEST: Final[str] = "regextest"
|
||||
|
||||
|
||||
def setup():
|
||||
tracemalloc.start()
|
||||
|
||||
@ -41,6 +48,7 @@ class WikiqTester:
|
||||
case_name: str | None = None,
|
||||
suffix: str | None = None,
|
||||
in_compression: str = "bz2",
|
||||
baseline_format: str = "tsv",
|
||||
out_format: str = "tsv",
|
||||
):
|
||||
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
|
||||
@ -51,14 +59,16 @@ class WikiqTester:
|
||||
self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
|
||||
|
||||
if suffix is None:
|
||||
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
|
||||
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
|
||||
else:
|
||||
self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
|
||||
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
|
||||
self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
|
||||
|
||||
# If case_name is unset, there are no relevant baseline or test files.
|
||||
if case_name is not None:
|
||||
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
|
||||
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
|
||||
self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
|
||||
if os.path.exists(self.test_file):
|
||||
os.remove(self.test_file)
|
||||
@ -78,6 +88,7 @@ class WikiqTester:
|
||||
print(call)
|
||||
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
|
||||
|
||||
|
||||
# with / without pwr DONE
|
||||
# with / without url encode DONE
|
||||
# with / without collapse user DONE
|
||||
@ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase):
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
|
||||
def test_WP_url_encode(self):
|
||||
tester = WikiqTester(IKWIKI, "url-encode")
|
||||
|
||||
@ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
def test_parquet(self):
|
||||
tester = WikiqTester(IKWIKI, "parquet", out_format="parquet")
|
||||
tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
|
||||
|
||||
try:
|
||||
tester.call_wikiq()
|
||||
@ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase):
|
||||
copyfile(tester.call_output, tester.test_file)
|
||||
|
||||
# As a test, make sure that we get equal data frames.
|
||||
test = pd.read_parquet(tester.test_file)
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
test: DataFrame = pd.read_parquet(tester.test_file)
|
||||
# test = test.drop(['reverteds'], axis=1)
|
||||
|
||||
baseline: DataFrame = pd.read_table(tester.baseline_file)
|
||||
|
||||
# Pandas does not read timestamps as the desired datetime type.
|
||||
baseline['date_time'] = pd.to_datetime(baseline['date_time'])
|
||||
# Split the comma-separated strings into lists of reverted IDs so they can be compared.
|
||||
baseline['revert'] = baseline['revert'].replace(np.nan, None)
|
||||
baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
|
||||
baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
|
||||
baseline['editor'] = baseline['editor'].replace(np.nan, None)
|
||||
baseline['anon'] = baseline['anon'].replace(np.nan, None)
|
||||
|
||||
for index, row in baseline.iterrows():
|
||||
if row['editorid'] is None or test['editorid'][index] is None:
|
||||
if row['editorid'] != test['editorid'][index]:
|
||||
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
|
||||
|
||||
for col in baseline.columns:
|
||||
try:
|
||||
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
|
||||
except ValueError as exc:
|
||||
print(f"Error comparing column {col}")
|
||||
self.fail(exc)
|
||||
|
||||
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
2
wikiq
2
wikiq
@ -270,7 +270,7 @@ class RevDataBase:
|
||||
pa.field("title", pa.string()),
|
||||
pa.field("namespace", pa.int32()),
|
||||
pa.field("deleted", pa.bool_()),
|
||||
pa.field("test_chars", pa.int32()),
|
||||
pa.field("text_chars", pa.int32()),
|
||||
pa.field("revert", pa.bool_()),
|
||||
pa.field("reverteds", pa.list_(pa.int64())),
|
||||
pa.field("sha1", pa.string()),
|
||||
|
Loading…
Reference in New Issue
Block a user