Get Parquet test working
This requires some data smoothing to make the DataFrames produced by read_table and read_parquet look close enough, but the test now passes and validates that the data match.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
52757a8239
commit
ee01ce3e61
@ -1,12 +1,18 @@
|
|||||||
|
import math
|
||||||
import unittest
|
import unittest
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from pandas import DataFrame
|
||||||
|
from pandas._testing import assert_series_equal
|
||||||
from pandas.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import tracemalloc
|
import tracemalloc
|
||||||
from typing import Final
|
from typing import Final
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
# Make references to files and wikiq relative to this file, not to the current working directory.
|
# Make references to files and wikiq relative to this file, not to the current working directory.
|
||||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||||
@ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon"
|
|||||||
TWINPEAKS: Final[str] = "twinpeaks"
|
TWINPEAKS: Final[str] = "twinpeaks"
|
||||||
REGEXTEST: Final[str] = "regextest"
|
REGEXTEST: Final[str] = "regextest"
|
||||||
|
|
||||||
|
|
||||||
def setup():
|
def setup():
|
||||||
tracemalloc.start()
|
tracemalloc.start()
|
||||||
|
|
||||||
@ -41,6 +48,7 @@ class WikiqTester:
|
|||||||
case_name: str | None = None,
|
case_name: str | None = None,
|
||||||
suffix: str | None = None,
|
suffix: str | None = None,
|
||||||
in_compression: str = "bz2",
|
in_compression: str = "bz2",
|
||||||
|
baseline_format: str = "tsv",
|
||||||
out_format: str = "tsv",
|
out_format: str = "tsv",
|
||||||
):
|
):
|
||||||
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
|
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
|
||||||
@ -51,14 +59,16 @@ class WikiqTester:
|
|||||||
self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
|
self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
|
||||||
|
|
||||||
if suffix is None:
|
if suffix is None:
|
||||||
|
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
|
||||||
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
|
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
|
||||||
else:
|
else:
|
||||||
|
self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
|
||||||
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
|
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
|
||||||
self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
|
self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
|
||||||
|
|
||||||
# If case_name is unset, there are no relevant baseline or test files.
|
# If case_name is unset, there are no relevant baseline or test files.
|
||||||
if case_name is not None:
|
if case_name is not None:
|
||||||
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
|
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
|
||||||
self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
|
self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
|
||||||
if os.path.exists(self.test_file):
|
if os.path.exists(self.test_file):
|
||||||
os.remove(self.test_file)
|
os.remove(self.test_file)
|
||||||
@ -78,6 +88,7 @@ class WikiqTester:
|
|||||||
print(call)
|
print(call)
|
||||||
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
|
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
|
||||||
|
|
||||||
|
|
||||||
# with / without pwr DONE
|
# with / without pwr DONE
|
||||||
# with / without url encode DONE
|
# with / without url encode DONE
|
||||||
# with / without collapse user DONE
|
# with / without collapse user DONE
|
||||||
@ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
baseline = pd.read_table(tester.baseline_file)
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
|
|
||||||
def test_WP_url_encode(self):
|
def test_WP_url_encode(self):
|
||||||
tester = WikiqTester(IKWIKI, "url-encode")
|
tester = WikiqTester(IKWIKI, "url-encode")
|
||||||
|
|
||||||
@ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
def test_parquet(self):
|
def test_parquet(self):
|
||||||
tester = WikiqTester(IKWIKI, "parquet", out_format="parquet")
|
tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq()
|
tester.call_wikiq()
|
||||||
@ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
copyfile(tester.call_output, tester.test_file)
|
copyfile(tester.call_output, tester.test_file)
|
||||||
|
|
||||||
# as a test let's make sure that we get equal data frames
|
# as a test let's make sure that we get equal data frames
|
||||||
test = pd.read_parquet(tester.test_file)
|
test: DataFrame = pd.read_parquet(tester.test_file)
|
||||||
baseline = pd.read_table(tester.baseline_file)
|
# test = test.drop(['reverteds'], axis=1)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
|
||||||
|
baseline: DataFrame = pd.read_table(tester.baseline_file)
|
||||||
|
|
||||||
|
# Pandas does not read timestamps as the desired datetime type.
|
||||||
|
baseline['date_time'] = pd.to_datetime(baseline['date_time'])
|
||||||
|
# Split strings into lists of reverted IDs so they can be compared.
|
||||||
|
baseline['revert'] = baseline['revert'].replace(np.nan, None)
|
||||||
|
baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
|
||||||
|
baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
|
||||||
|
baseline['editor'] = baseline['editor'].replace(np.nan, None)
|
||||||
|
baseline['anon'] = baseline['anon'].replace(np.nan, None)
|
||||||
|
|
||||||
|
for index, row in baseline.iterrows():
|
||||||
|
if row['editorid'] is None or test['editorid'][index] is None:
|
||||||
|
if row['editorid'] != test['editorid'][index]:
|
||||||
|
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
|
||||||
|
|
||||||
|
for col in baseline.columns:
|
||||||
|
try:
|
||||||
|
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
|
||||||
|
except ValueError as exc:
|
||||||
|
print(f"Error comparing column {col}")
|
||||||
|
self.fail(exc)
|
||||||
|
|
||||||
|
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
2
wikiq
2
wikiq
@ -270,7 +270,7 @@ class RevDataBase:
|
|||||||
pa.field("title", pa.string()),
|
pa.field("title", pa.string()),
|
||||||
pa.field("namespace", pa.int32()),
|
pa.field("namespace", pa.int32()),
|
||||||
pa.field("deleted", pa.bool_()),
|
pa.field("deleted", pa.bool_()),
|
||||||
pa.field("test_chars", pa.int32()),
|
pa.field("text_chars", pa.int32()),
|
||||||
pa.field("revert", pa.bool_()),
|
pa.field("revert", pa.bool_()),
|
||||||
pa.field("reverteds", pa.list_(pa.int64())),
|
pa.field("reverteds", pa.list_(pa.int64())),
|
||||||
pa.field("sha1", pa.string()),
|
pa.field("sha1", pa.string()),
|
||||||
|
Loading…
Reference in New Issue
Block a user