Get Parquet test working
This requires some data normalization to make the DataFrames returned by read_table and read_parquet comparable, but the test now passes and validates that the data match.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									52757a8239
								
							
						
					
					
						commit
						ee01ce3e61
					
				| @ -1,12 +1,18 @@ | |||||||
|  | import math | ||||||
| import unittest | import unittest | ||||||
| import os | import os | ||||||
| import subprocess | import subprocess | ||||||
| from shutil import copyfile | from shutil import copyfile | ||||||
|  | 
 | ||||||
|  | import numpy as np | ||||||
| import pandas as pd | import pandas as pd | ||||||
|  | from pandas import DataFrame | ||||||
|  | from pandas._testing import assert_series_equal | ||||||
| from pandas.testing import assert_frame_equal | from pandas.testing import assert_frame_equal | ||||||
| from io import StringIO | from io import StringIO | ||||||
| import tracemalloc | import tracemalloc | ||||||
| from typing import Final | from typing import Final | ||||||
|  | from datetime import datetime | ||||||
| 
 | 
 | ||||||
| # Make references to files and wikiq relative to this file, not to the current working directory. | # Make references to files and wikiq relative to this file, not to the current working directory. | ||||||
| TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | ||||||
| @ -20,6 +26,7 @@ SAILORMOON: Final[str] = "sailormoon" | |||||||
| TWINPEAKS: Final[str] = "twinpeaks" | TWINPEAKS: Final[str] = "twinpeaks" | ||||||
| REGEXTEST: Final[str] = "regextest" | REGEXTEST: Final[str] = "regextest" | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def setup(): | def setup(): | ||||||
|     tracemalloc.start() |     tracemalloc.start() | ||||||
| 
 | 
 | ||||||
| @ -41,6 +48,7 @@ class WikiqTester: | |||||||
|                  case_name: str | None = None, |                  case_name: str | None = None, | ||||||
|                  suffix: str | None = None, |                  suffix: str | None = None, | ||||||
|                  in_compression: str = "bz2", |                  in_compression: str = "bz2", | ||||||
|  |                  baseline_format: str = "tsv", | ||||||
|                  out_format: str = "tsv", |                  out_format: str = "tsv", | ||||||
|                  ): |                  ): | ||||||
|         self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) |         self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) | ||||||
| @ -51,14 +59,16 @@ class WikiqTester: | |||||||
|             self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR) |             self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR) | ||||||
| 
 | 
 | ||||||
|         if suffix is None: |         if suffix is None: | ||||||
|  |             self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) | ||||||
|             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) |             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) | ||||||
|         else: |         else: | ||||||
|  |             self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) | ||||||
|             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) |             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) | ||||||
|         self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format)) |         self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format)) | ||||||
| 
 | 
 | ||||||
|         # If case_name is unset, there are no relevant baseline or test files. |         # If case_name is unset, there are no relevant baseline or test files. | ||||||
|         if case_name is not None: |         if case_name is not None: | ||||||
|             self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) |             self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) | ||||||
|             self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name)) |             self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name)) | ||||||
|             if os.path.exists(self.test_file): |             if os.path.exists(self.test_file): | ||||||
|                 os.remove(self.test_file) |                 os.remove(self.test_file) | ||||||
| @ -78,6 +88,7 @@ class WikiqTester: | |||||||
|         print(call) |         print(call) | ||||||
|         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) |         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| # with / without pwr DONE | # with / without pwr DONE | ||||||
| # with / without url encode DONE | # with / without url encode DONE | ||||||
| # with / without collapse user DONE | # with / without collapse user DONE | ||||||
| @ -103,7 +114,6 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         baseline = pd.read_table(tester.baseline_file) |         baseline = pd.read_table(tester.baseline_file) | ||||||
|         assert_frame_equal(test, baseline, check_like=True) |         assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     def test_WP_url_encode(self): |     def test_WP_url_encode(self): | ||||||
|         tester = WikiqTester(IKWIKI, "url-encode") |         tester = WikiqTester(IKWIKI, "url-encode") | ||||||
| 
 | 
 | ||||||
| @ -256,7 +266,7 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") |         tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") |             outs = tester.call_wikiq("--stdout", out=False).decode("utf8") | ||||||
|         except subprocess.CalledProcessError as exc: |         except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |             self.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
| @ -304,7 +314,7 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|             tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) |             tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 tester.call_wikiq( arguments) |                 tester.call_wikiq(arguments) | ||||||
|             except subprocess.CalledProcessError as exc: |             except subprocess.CalledProcessError as exc: | ||||||
|                 self.fail(exc.stderr.decode("utf8")) |                 self.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
| @ -338,7 +348,7 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|             assert_frame_equal(test, baseline, check_like=True) |             assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_parquet(self): |     def test_parquet(self): | ||||||
|         tester = WikiqTester(IKWIKI, "parquet", out_format="parquet") |         tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             tester.call_wikiq() |             tester.call_wikiq() | ||||||
| @ -348,9 +358,34 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         copyfile(tester.call_output, tester.test_file) |         copyfile(tester.call_output, tester.test_file) | ||||||
| 
 | 
 | ||||||
|         # as a test let's make sure that we get equal data frames |         # as a test let's make sure that we get equal data frames | ||||||
|         test = pd.read_parquet(tester.test_file) |         test: DataFrame = pd.read_parquet(tester.test_file) | ||||||
|         baseline = pd.read_table(tester.baseline_file) |         # test = test.drop(['reverteds'], axis=1) | ||||||
|         assert_frame_equal(test, baseline, check_like=True) | 
 | ||||||
|  |         baseline: DataFrame = pd.read_table(tester.baseline_file) | ||||||
|  | 
 | ||||||
|  |         # Pandas does not read timestamps as the desired datetime type. | ||||||
|  |         baseline['date_time'] = pd.to_datetime(baseline['date_time']) | ||||||
|  |         # Split strings to the arrays of reverted IDs so they can be compared. | ||||||
|  |         baseline['revert'] = baseline['revert'].replace(np.nan, None) | ||||||
|  |         baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] | ||||||
|  |         baseline['sha1'] = baseline['sha1'].replace(np.nan, None) | ||||||
|  |         baseline['editor'] = baseline['editor'].replace(np.nan, None) | ||||||
|  |         baseline['anon'] = baseline['anon'].replace(np.nan, None) | ||||||
|  | 
 | ||||||
|  |         for index, row in baseline.iterrows(): | ||||||
|  |             if row['editorid'] is None or test['editorid'][index] is None: | ||||||
|  |                 if row['editorid'] != test['editorid'][index]: | ||||||
|  |                     print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index]) | ||||||
|  | 
 | ||||||
|  |         for col in baseline.columns: | ||||||
|  |             try: | ||||||
|  |                 assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) | ||||||
|  |             except ValueError as exc: | ||||||
|  |                 print(f"Error comparing column {col}") | ||||||
|  |                 self.fail(exc) | ||||||
|  | 
 | ||||||
|  |         # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|     unittest.main() |     unittest.main() | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								wikiq
									
									
									
									
									
								
							| @ -270,7 +270,7 @@ class RevDataBase: | |||||||
|         pa.field("title", pa.string()), |         pa.field("title", pa.string()), | ||||||
|         pa.field("namespace", pa.int32()), |         pa.field("namespace", pa.int32()), | ||||||
|         pa.field("deleted", pa.bool_()), |         pa.field("deleted", pa.bool_()), | ||||||
|         pa.field("test_chars", pa.int32()), |         pa.field("text_chars", pa.int32()), | ||||||
|         pa.field("revert", pa.bool_()), |         pa.field("revert", pa.bool_()), | ||||||
|         pa.field("reverteds", pa.list_(pa.int64())), |         pa.field("reverteds", pa.list_(pa.int64())), | ||||||
|         pa.field("sha1", pa.string()), |         pa.field("sha1", pa.string()), | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user