Get Parquet test working
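This requires some data normalization (timestamp parsing, NaN-to-None replacement, and splitting the comma-separated reverted IDs into lists) so that the DataFrames produced by read_table and read_parquet are comparable; the test now passes and validates that the data match. Signed-off-by: Will Beason <willbeason@gmail.com>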
This commit is contained in:
		
							parent
							
								
									52757a8239
								
							
						
					
					
						commit
						ee01ce3e61
					
				| @ -1,12 +1,18 @@ | ||||
| import math | ||||
| import unittest | ||||
| import os | ||||
| import subprocess | ||||
| from shutil import copyfile | ||||
| 
 | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from pandas import DataFrame | ||||
| from pandas._testing import assert_series_equal | ||||
| from pandas.testing import assert_frame_equal | ||||
| from io import StringIO | ||||
| import tracemalloc | ||||
| from typing import Final | ||||
| from datetime import datetime | ||||
| 
 | ||||
| # Make references to files and wikiq relative to this file, not to the current working directory. | ||||
| TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | ||||
| @ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon" | ||||
| TWINPEAKS: Final[str] = "twinpeaks" | ||||
| REGEXTEST: Final[str] = "regextest" | ||||
| 
 | ||||
| 
 | ||||
| def setup(): | ||||
|     tracemalloc.start() | ||||
| 
 | ||||
| @ -41,6 +48,7 @@ class WikiqTester: | ||||
|                  case_name: str | None = None, | ||||
|                  suffix: str | None = None, | ||||
|                  in_compression: str = "bz2", | ||||
|                  baseline_format: str = "tsv", | ||||
|                  out_format: str = "tsv", | ||||
|                  ): | ||||
|         self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) | ||||
| @ -51,14 +59,16 @@ class WikiqTester: | ||||
|             self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR) | ||||
| 
 | ||||
|         if suffix is None: | ||||
|             self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) | ||||
|             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) | ||||
|         else: | ||||
|             self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) | ||||
|             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) | ||||
|         self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format)) | ||||
| 
 | ||||
|         # If case_name is unset, there are no relevant baseline or test files. | ||||
|         if case_name is not None: | ||||
|             self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) | ||||
|             self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) | ||||
|             self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name)) | ||||
|             if os.path.exists(self.test_file): | ||||
|                 os.remove(self.test_file) | ||||
| @ -78,6 +88,7 @@ class WikiqTester: | ||||
|         print(call) | ||||
|         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) | ||||
| 
 | ||||
| 
 | ||||
| # with / without pwr DONE | ||||
| # with / without url encode DONE | ||||
| # with / without collapse user DONE | ||||
| @ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
| 
 | ||||
|     def test_WP_url_encode(self): | ||||
|         tester = WikiqTester(IKWIKI, "url-encode") | ||||
| 
 | ||||
| @ -256,7 +266,7 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") | ||||
| 
 | ||||
|         try: | ||||
|             outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") | ||||
|             outs = tester.call_wikiq("--stdout", out=False).decode("utf8") | ||||
|         except subprocess.CalledProcessError as exc: | ||||
|             self.fail(exc.stderr.decode("utf8")) | ||||
| 
 | ||||
| @ -304,7 +314,7 @@ class WikiqTestCase(unittest.TestCase): | ||||
|             tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) | ||||
| 
 | ||||
|             try: | ||||
|                 tester.call_wikiq( arguments) | ||||
|                 tester.call_wikiq(arguments) | ||||
|             except subprocess.CalledProcessError as exc: | ||||
|                 self.fail(exc.stderr.decode("utf8")) | ||||
| 
 | ||||
| @ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase): | ||||
|             assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_parquet(self): | ||||
|         tester = WikiqTester(IKWIKI, "parquet", out_format="parquet") | ||||
|         tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") | ||||
| 
 | ||||
|         try: | ||||
|             tester.call_wikiq() | ||||
| @ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         copyfile(tester.call_output, tester.test_file) | ||||
| 
 | ||||
|         # as a test let's make sure that we get equal data frames | ||||
|         test = pd.read_parquet(tester.test_file) | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
|         test: DataFrame = pd.read_parquet(tester.test_file) | ||||
|         # test = test.drop(['reverteds'], axis=1) | ||||
| 
 | ||||
|         baseline: DataFrame = pd.read_table(tester.baseline_file) | ||||
| 
 | ||||
|         # Pandas does not read timestamps as the desired datetime type. | ||||
|         baseline['date_time'] = pd.to_datetime(baseline['date_time']) | ||||
|         # Split strings to the arrays of reverted IDs so they can be compared. | ||||
|         baseline['revert'] = baseline['revert'].replace(np.nan, None) | ||||
|         baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] | ||||
|         baseline['sha1'] = baseline['sha1'].replace(np.nan, None) | ||||
|         baseline['editor'] = baseline['editor'].replace(np.nan, None) | ||||
|         baseline['anon'] = baseline['anon'].replace(np.nan, None) | ||||
| 
 | ||||
|         for index, row in baseline.iterrows(): | ||||
|             if row['editorid'] is None or test['editorid'][index] is None: | ||||
|                 if row['editorid'] != test['editorid'][index]: | ||||
|                     print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index]) | ||||
| 
 | ||||
|         for col in baseline.columns: | ||||
|             try: | ||||
|                 assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) | ||||
|             except ValueError as exc: | ||||
|                 print(f"Error comparing column {col}") | ||||
|                 self.fail(exc) | ||||
| 
 | ||||
|         # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|  | ||||
							
								
								
									
										2
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								wikiq
									
									
									
									
									
								
							| @ -270,7 +270,7 @@ class RevDataBase: | ||||
|         pa.field("title", pa.string()), | ||||
|         pa.field("namespace", pa.int32()), | ||||
|         pa.field("deleted", pa.bool_()), | ||||
|         pa.field("test_chars", pa.int32()), | ||||
|         pa.field("text_chars", pa.int32()), | ||||
|         pa.field("revert", pa.bool_()), | ||||
|         pa.field("reverteds", pa.list_(pa.int64())), | ||||
|         pa.field("sha1", pa.string()), | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user