add (optional) diff and text columns to output.
This commit is contained in:
		
parent a8e9e7f4fd
commit d6c4c0a416
					
				| @ -218,3 +218,11 @@ class RevisionCollapsed(RevisionField[int]): | |||||||
| 
 | 
 | ||||||
|     def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int: |     def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int: | ||||||
|         return len(revisions) |         return len(revisions) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class RevisionText(RevisionField[str]): | ||||||
|  |     field = pa.field("text", pa.string(), nullable=False) | ||||||
|  | 
 | ||||||
|  |     def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str: | ||||||
|  |         revision = revisions[-1] | ||||||
|  |         return revision.text | ||||||
|  | |||||||
| @ -1,15 +1,14 @@ | |||||||
| import shutil |  | ||||||
| import unittest |  | ||||||
| import os | import os | ||||||
|  | import shutil | ||||||
| import subprocess | import subprocess | ||||||
| 
 | import tracemalloc | ||||||
|  | from io import StringIO | ||||||
|  | from typing import Final, Union | ||||||
|  | import pytest | ||||||
| import numpy as np | import numpy as np | ||||||
| import pandas as pd | import pandas as pd | ||||||
| from pandas import DataFrame | from pandas import DataFrame | ||||||
| from pandas.testing import assert_frame_equal, assert_series_equal | from pandas.testing import assert_frame_equal, assert_series_equal | ||||||
| from io import StringIO |  | ||||||
| import tracemalloc |  | ||||||
| from typing import Final, Union |  | ||||||
| 
 | 
 | ||||||
| # Make references to files and wikiq relative to this file, not to the current working directory. | # Make references to files and wikiq relative to this file, not to the current working directory. | ||||||
| TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) | ||||||
| @ -37,7 +36,8 @@ setup() | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class WikiqTester: | class WikiqTester: | ||||||
|     def __init__(self, |     def __init__( | ||||||
|  |         self, | ||||||
|         wiki: str, |         wiki: str, | ||||||
|         case_name: str, |         case_name: str, | ||||||
|         suffix: Union[str, None] = None, |         suffix: Union[str, None] = None, | ||||||
| @ -45,13 +45,17 @@ class WikiqTester: | |||||||
|         baseline_format: str = "tsv", |         baseline_format: str = "tsv", | ||||||
|         out_format: str = "tsv", |         out_format: str = "tsv", | ||||||
|     ): |     ): | ||||||
|         self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) |         self.input_file = os.path.join( | ||||||
|  |             TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression) | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         basename = "{0}_{1}".format(case_name, wiki) |         basename = "{0}_{1}".format(case_name, wiki) | ||||||
|         if suffix: |         if suffix: | ||||||
|             basename = "{0}_{1}".format(basename, suffix) |             basename = "{0}_{1}".format(basename, suffix) | ||||||
| 
 | 
 | ||||||
|         self.output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)) |         self.output = os.path.join( | ||||||
|  |             TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format) | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         if os.path.exists(self.output): |         if os.path.exists(self.output): | ||||||
|             if os.path.isfile(self.output): |             if os.path.isfile(self.output): | ||||||
| @ -66,12 +70,16 @@ class WikiqTester: | |||||||
|             self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) |             self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) | ||||||
|             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) |             self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) | ||||||
|         else: |         else: | ||||||
|             self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) |             self.wikiq_baseline_name = "{0}_{1}.{2}".format( | ||||||
|  |                 wiki, suffix, baseline_format | ||||||
|  |             ) | ||||||
|             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) |             self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) | ||||||
| 
 | 
 | ||||||
|         # If case_name is unset, there are no relevant baseline or test files. |         # If case_name is unset, there are no relevant baseline or test files. | ||||||
|         if case_name is not None: |         if case_name is not None: | ||||||
|             self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) |             self.baseline_file = os.path.join( | ||||||
|  |                 BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name) | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|     def call_wikiq(self, *args: str, out: bool = True): |     def call_wikiq(self, *args: str, out: bool = True): | ||||||
|         """ |         """ | ||||||
| @ -81,74 +89,72 @@ class WikiqTester: | |||||||
|         :return: The output of the wikiq call. |         :return: The output of the wikiq call. | ||||||
|         """ |         """ | ||||||
|         if out: |         if out: | ||||||
|             call = ' '.join([WIKIQ, self.input_file, "-o", self.output, *args]) |             call = " ".join([WIKIQ, self.input_file, "-o", self.output, *args]) | ||||||
|         else: |         else: | ||||||
|             call = ' '.join([WIKIQ, self.input_file, *args]) |             call = " ".join([WIKIQ, self.input_file, *args]) | ||||||
| 
 | 
 | ||||||
|         print(call) |         print(call) | ||||||
|         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) |         return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) | ||||||
| 
 | 
 | ||||||
|  |     # with / without pwr DONE | ||||||
|  |     # with / without url encode DONE | ||||||
|  |     # with / without collapse user DONE | ||||||
|  |     # with output to stdout DONE | ||||||
|  |     # note that the persistence radius is 7 by default | ||||||
|  |     # reading various file formats including | ||||||
|  |     #        7z, gz, bz2, xml  DONE | ||||||
|  |     # wikia and wikipedia data DONE | ||||||
|  |     # malformed xmls DONE | ||||||
| 
 | 
 | ||||||
| # with / without pwr DONE | def test_WP_noargs(): | ||||||
| # with / without url encode DONE |  | ||||||
| # with / without collapse user DONE |  | ||||||
| # with output to stdout DONE |  | ||||||
| # note that the persistence radius is 7 by default |  | ||||||
| # reading various file formats including |  | ||||||
| #        7z, gz, bz2, xml  DONE |  | ||||||
| # wikia and wikipedia data DONE |  | ||||||
| # malformed xmls DONE |  | ||||||
| 
 |  | ||||||
| class WikiqTestCase(unittest.TestCase): |  | ||||||
|     def test_WP_noargs(self): |  | ||||||
|     tester = WikiqTester(IKWIKI, "noargs") |     tester = WikiqTester(IKWIKI, "noargs") | ||||||
|      |      | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq() |         tester.call_wikiq() | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
|          |          | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_WP_namespaces(self): | def test_WP_namespaces(): | ||||||
|     tester = WikiqTester(IKWIKI, "namespaces") |     tester = WikiqTester(IKWIKI, "namespaces") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("-n 0", "-n 1") |         tester.call_wikiq("-n 0", "-n 1") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     # as a test let's make sure that we get equal data frames |     # as a test let's make sure that we get equal data frames | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|         num_wrong_ns = sum(~ test.namespace.isin({0, 1})) |     num_wrong_ns = sum(~test.namespace.isin({0, 1})) | ||||||
|     self.assertEqual(num_wrong_ns, 0) |     self.assertEqual(num_wrong_ns, 0) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_WP_revert_radius(self): | def test_WP_revert_radius(): | ||||||
|     tester = WikiqTester(IKWIKI, "revert_radius") |     tester = WikiqTester(IKWIKI, "revert_radius") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("-n 0", "-n 1", "-rr 1") |         tester.call_wikiq("-n 0", "-n 1", "-rr 1") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     # as a test let's make sure that we get equal data frames |     # as a test let's make sure that we get equal data frames | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|         num_wrong_ns = sum(~ test.namespace.isin({0, 1})) |     num_wrong_ns = sum(~test.namespace.isin({0, 1})) | ||||||
|     self.assertEqual(num_wrong_ns, 0) |     self.assertEqual(num_wrong_ns, 0) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_WP_no_revert_radius(self): | def test_WP_no_revert_radius(): | ||||||
|     tester = WikiqTester(IKWIKI, "no_revert_radius") |     tester = WikiqTester(IKWIKI, "no_revert_radius") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("-rr 0") |         tester.call_wikiq("-rr 0") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     # as a test let's make sure that we get equal data frames |     # as a test let's make sure that we get equal data frames | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
| @ -157,85 +163,85 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_WP_collapse_user(self): | def test_WP_collapse_user(): | ||||||
|     tester = WikiqTester(IKWIKI, "collapse_user") |     tester = WikiqTester(IKWIKI, "collapse_user") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--collapse-user") |         tester.call_wikiq("--collapse-user") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_noargs(self): | def test_noargs(): | ||||||
|     tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq() |         tester.call_wikiq() | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
|          |          | ||||||
|     def test_collapse_user(self): | def test_collapse_user(): | ||||||
|     tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--collapse-user", "--fandom-2020") |         tester.call_wikiq("--collapse-user", "--fandom-2020") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_pwr_wikidiff2(self): | def test_pwr_wikidiff2(): | ||||||
|     tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--persistence wikidiff2", "--fandom-2020") |         tester.call_wikiq("--persistence wikidiff2", "--fandom-2020") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|         test = pd.read_table(tester.output) |     test = pd.read_parquet(tester.output) | ||||||
|         baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_parquet(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_pwr_segment(self): | def test_pwr_segment(): | ||||||
|     tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--persistence segment", "--fandom-2020") |         tester.call_wikiq("--persistence segment", "--fandom-2020") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_pwr_legacy(self): | def test_pwr_legacy(): | ||||||
|     tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--persistence legacy", "--fandom-2020") |         tester.call_wikiq("--persistence legacy", "--fandom-2020") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_pwr(self): | def test_pwr(): | ||||||
|     tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") |     tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq("--persistence", "--fandom-2020") |         tester.call_wikiq("--persistence", "--fandom-2020") | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(tester.output) |     test = pd.read_table(tester.output) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
| @ -243,9 +249,39 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|     test = test.reindex(columns=sorted(test.columns)) |     test = test.reindex(columns=sorted(test.columns)) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_malformed_noargs(self): | def test_diff(): | ||||||
|  |     tester = WikiqTester(SAILORMOON, "diff", in_compression="7z", out_format='parquet', baseline_format='parquet') | ||||||
|  | 
 | ||||||
|  |     try: | ||||||
|  |         tester.call_wikiq("--diff", "--fandom-2020") | ||||||
|  |     except subprocess.CalledProcessError as exc: | ||||||
|  |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
|  | 
 | ||||||
|  |     test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet") | ||||||
|  |     baseline = pd.read_parquet(tester.baseline_file) | ||||||
|  | 
 | ||||||
|  |     test = test.reindex(columns=sorted(test.columns)) | ||||||
|  |     assert_frame_equal(test, baseline, check_like=True) | ||||||
|  | 
 | ||||||
|  | def test_text(): | ||||||
|  |     tester = WikiqTester(SAILORMOON, "text", in_compression="7z", out_format='parquet', baseline_format='parquet') | ||||||
|  | 
 | ||||||
|  |     try: | ||||||
|  |         tester.call_wikiq("--diff", "--text","--fandom-2020") | ||||||
|  |     except subprocess.CalledProcessError as exc: | ||||||
|  |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
|  | 
 | ||||||
|  |     test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet") | ||||||
|  |     baseline = pd.read_parquet(tester.baseline_file) | ||||||
|  | 
 | ||||||
|  |     test = test.reindex(columns=sorted(test.columns)) | ||||||
|  |     assert_frame_equal(test, baseline, check_like=True) | ||||||
|  | 
 | ||||||
|  | def test_malformed_noargs(): | ||||||
|     tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z") |     tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z") | ||||||
|         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' |     want_exception = ( | ||||||
|  |         "xml.etree.ElementTree.ParseError: no element found: line 1369, column 0" | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq() |         tester.call_wikiq() | ||||||
| @ -253,21 +289,23 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         errlines = exc.stderr.decode("utf8").splitlines() |         errlines = exc.stderr.decode("utf8").splitlines() | ||||||
|         self.assertEqual(errlines[-1], want_exception) |         self.assertEqual(errlines[-1], want_exception) | ||||||
|     else: |     else: | ||||||
|             self.fail("No exception raised, want: {}".format(want_exception)) |         pytest.fail("No exception raised, want: {}".format(want_exception)) | ||||||
| 
 | 
 | ||||||
|     def test_stdout_noargs(self): | def test_stdout_noargs(): | ||||||
|     tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") |     tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|             outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode("utf8") |         outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode( | ||||||
|  |             "utf8" | ||||||
|  |         ) | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     test = pd.read_table(StringIO(outs)) |     test = pd.read_table(StringIO(outs)) | ||||||
|     baseline = pd.read_table(tester.baseline_file) |     baseline = pd.read_table(tester.baseline_file) | ||||||
|     assert_frame_equal(test, baseline, check_like=True) |     assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_bad_regex(self): | def test_bad_regex(): | ||||||
|     tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex") |     tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex") | ||||||
| 
 | 
 | ||||||
|     # sample arguments for checking that bad arguments get terminated / test_regex_arguments |     # sample arguments for checking that bad arguments get terminated / test_regex_arguments | ||||||
| @ -280,7 +318,7 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         "-CP '(Tamil|Li)' -RPl testlabel", |         "-CP '(Tamil|Li)' -RPl testlabel", | ||||||
|         # regex is missing |         # regex is missing | ||||||
|         "-CPl testlabel", |         "-CPl testlabel", | ||||||
|             "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" |         "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'", | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     for arguments in bad_arguments_list: |     for arguments in bad_arguments_list: | ||||||
| @ -290,15 +328,15 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|             # we want to check that the bad arguments were caught and sys.exit is stopping the code |             # we want to check that the bad arguments were caught and sys.exit is stopping the code | ||||||
|             print(exc.stderr.decode("utf-8")) |             print(exc.stderr.decode("utf-8")) | ||||||
|         else: |         else: | ||||||
|                 self.fail("No exception raised, want Exception") |             pytest.fail("No exception raised, want Exception") | ||||||
| 
 | 
 | ||||||
|     def test_good_regex(self): | def test_good_regex(): | ||||||
|     # sample arguments for checking the outcomes of good arguments / test_basic_regex |     # sample arguments for checking the outcomes of good arguments / test_basic_regex | ||||||
|     good_arguments_list = [ |     good_arguments_list = [ | ||||||
|         "-RP '\\b\\d{3}\\b' -RPl threedigits", |         "-RP '\\b\\d{3}\\b' -RPl threedigits", | ||||||
|         "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", |         "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", | ||||||
|         "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", |         "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", | ||||||
|             "-CP 'WP:EVADE' -CPl wp_evade" |         "-CP 'WP:EVADE' -CPl wp_evade", | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     for i, arguments in enumerate(good_arguments_list): |     for i, arguments in enumerate(good_arguments_list): | ||||||
| @ -307,7 +345,7 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         try: |         try: | ||||||
|             tester.call_wikiq(arguments) |             tester.call_wikiq(arguments) | ||||||
|         except subprocess.CalledProcessError as exc: |         except subprocess.CalledProcessError as exc: | ||||||
|                 self.fail(exc.stderr.decode("utf8")) |             pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|         test = pd.read_table(tester.output) |         test = pd.read_table(tester.output) | ||||||
| 
 | 
 | ||||||
| @ -315,32 +353,34 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|         assert_frame_equal(test, baseline, check_like=True) |         assert_frame_equal(test, baseline, check_like=True) | ||||||
|         print(i) |         print(i) | ||||||
| 
 | 
 | ||||||
|     def test_capturegroup_regex(self): | def test_capturegroup_regex(): | ||||||
|     cap_arguments_list = [ |     cap_arguments_list = [ | ||||||
|         "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three", |         "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three", | ||||||
|             "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov" |         "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov", | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
|     for i, arguments in enumerate(cap_arguments_list): |     for i, arguments in enumerate(cap_arguments_list): | ||||||
|             tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)) |         tester = WikiqTester( | ||||||
|  |             wiki=REGEXTEST, case_name="capturegroup", suffix=str(i) | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             tester.call_wikiq(arguments) |             tester.call_wikiq(arguments) | ||||||
|         except subprocess.CalledProcessError as exc: |         except subprocess.CalledProcessError as exc: | ||||||
|                 self.fail(exc.stderr.decode("utf8")) |             pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|         test = pd.read_table(tester.output) |         test = pd.read_table(tester.output) | ||||||
| 
 | 
 | ||||||
|         baseline = pd.read_table(tester.baseline_file) |         baseline = pd.read_table(tester.baseline_file) | ||||||
|         assert_frame_equal(test, baseline, check_like=True) |         assert_frame_equal(test, baseline, check_like=True) | ||||||
| 
 | 
 | ||||||
|     def test_parquet(self): | def test_parquet(): | ||||||
|     tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") |     tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         tester.call_wikiq() |         tester.call_wikiq() | ||||||
|     except subprocess.CalledProcessError as exc: |     except subprocess.CalledProcessError as exc: | ||||||
|             self.fail(exc.stderr.decode("utf8")) |         pytest.fail(exc.stderr.decode("utf8")) | ||||||
| 
 | 
 | ||||||
|     # as a test let's make sure that we get equal data frames |     # as a test let's make sure that we get equal data frames | ||||||
|     test: DataFrame = pd.read_parquet(tester.output) |     test: DataFrame = pd.read_parquet(tester.output) | ||||||
| @ -349,28 +389,26 @@ class WikiqTestCase(unittest.TestCase): | |||||||
|     baseline: DataFrame = pd.read_table(tester.baseline_file) |     baseline: DataFrame = pd.read_table(tester.baseline_file) | ||||||
| 
 | 
 | ||||||
|     # Pandas does not read timestamps as the desired datetime type. |     # Pandas does not read timestamps as the desired datetime type. | ||||||
|         baseline['date_time'] = pd.to_datetime(baseline['date_time']) |     baseline["date_time"] = pd.to_datetime(baseline["date_time"]) | ||||||
|     # Split strings to the arrays of reverted IDs so they can be compared. |     # Split strings to the arrays of reverted IDs so they can be compared. | ||||||
|         baseline['revert'] = baseline['revert'].replace(np.nan, None) |     baseline["revert"] = baseline["revert"].replace(np.nan, None) | ||||||
|         baseline['reverteds'] = baseline['reverteds'].replace(np.nan, None) |     baseline["reverteds"] = baseline["reverteds"].replace(np.nan, None) | ||||||
|     # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] |     # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] | ||||||
|         baseline['sha1'] = baseline['sha1'].replace(np.nan, None) |     baseline["sha1"] = baseline["sha1"].replace(np.nan, None) | ||||||
|         baseline['editor'] = baseline['editor'].replace(np.nan, None) |     baseline["editor"] = baseline["editor"].replace(np.nan, None) | ||||||
|         baseline['anon'] = baseline['anon'].replace(np.nan, None) |     baseline["anon"] = baseline["anon"].replace(np.nan, None) | ||||||
| 
 | 
 | ||||||
|     for index, row in baseline.iterrows(): |     for index, row in baseline.iterrows(): | ||||||
|             if row['revert'] != test['revert'][index]: |         if row["revert"] != test["revert"][index]: | ||||||
|                 print(row['revid'], ":", row['revert'], "!=", test['revert'][index]) |             print(row["revid"], ":", row["revert"], "!=", test["revert"][index]) | ||||||
| 
 | 
 | ||||||
|     for col in baseline.columns: |     for col in baseline.columns: | ||||||
|         try: |         try: | ||||||
|                 assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) |             assert_series_equal( | ||||||
|  |                 test[col], baseline[col], check_like=True, check_dtype=False | ||||||
|  |             ) | ||||||
|         except ValueError as exc: |         except ValueError as exc: | ||||||
|             print(f"Error comparing column {col}") |             print(f"Error comparing column {col}") | ||||||
|                 self.fail(exc) |             pytest.fail(exc) | ||||||
| 
 | 
 | ||||||
|     # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) |     # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     unittest.main() |  | ||||||
|  | |||||||
| @ -16,10 +16,10 @@ import pywikidiff2 | |||||||
| class DiffToOperationMap: | class DiffToOperationMap: | ||||||
|     def __init__(self, diff, tokenizer): |     def __init__(self, diff, tokenizer): | ||||||
|         self.tokenizer = tokenizer |         self.tokenizer = tokenizer | ||||||
|         self.diff = json.loads(diff) |  | ||||||
|         self.from_par_move_dict = {} |         self.from_par_move_dict = {} | ||||||
|         self.to_par_move_dict = {} |         self.to_par_move_dict = {} | ||||||
|         self.highlights_without_offset = [] |         self.highlights_without_offset = [] | ||||||
|  |         self.diff = diff | ||||||
|         # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. |         # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. | ||||||
|         self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() |         self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() | ||||||
|         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() |         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() | ||||||
| @ -375,7 +375,7 @@ class WikiDiffMatcher: | |||||||
|             numContextLines=1000000, moved_paragraph_detection_cutoff=200000 |             numContextLines=1000000, moved_paragraph_detection_cutoff=200000 | ||||||
|         ) |         ) | ||||||
|         # Pre-compute diffs to reduce traffic overhead. |         # Pre-compute diffs to reduce traffic overhead. | ||||||
|         self.diffs = differ.inline_json_diff_sequence(list(texts)) |         self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))] | ||||||
|         self.tokenizer = tokenizer or TOKENIZER |         self.tokenizer = tokenizer or TOKENIZER | ||||||
| 
 | 
 | ||||||
|     class Processor(DiffEngine.Processor): |     class Processor(DiffEngine.Processor): | ||||||
|  | |||||||
							
								
								
									
										48
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										48
									
								
								wikiq
									
									
									
									
									
								
							| @ -208,6 +208,8 @@ class WikiqParser: | |||||||
|                  regex_match_comment: list[str], |                  regex_match_comment: list[str], | ||||||
|                  regex_revision_label: list[str], |                  regex_revision_label: list[str], | ||||||
|                  regex_comment_label: list[str], |                  regex_comment_label: list[str], | ||||||
|  |                  text: bool = False, | ||||||
|  |                  diff: bool = False, | ||||||
|                  collapse_user: bool = False, |                  collapse_user: bool = False, | ||||||
|                  persist: int = None, |                  persist: int = None, | ||||||
|                  namespaces: Union[list[int], None] = None, |                  namespaces: Union[list[int], None] = None, | ||||||
| @ -226,7 +228,8 @@ class WikiqParser: | |||||||
|         self.persist: int = persist |         self.persist: int = persist | ||||||
|         self.namespaces = [] |         self.namespaces = [] | ||||||
|         self.revert_radius = revert_radius |         self.revert_radius = revert_radius | ||||||
| 
 |         self.diff = diff | ||||||
|  |         self.text = text | ||||||
|         if namespaces is not None: |         if namespaces is not None: | ||||||
|             self.namespace_filter = set(namespaces) |             self.namespace_filter = set(namespaces) | ||||||
|         else: |         else: | ||||||
| @ -331,6 +334,9 @@ class WikiqParser: | |||||||
|             tables.RevisionIsAnon(), |             tables.RevisionIsAnon(), | ||||||
|         ]) |         ]) | ||||||
| 
 | 
 | ||||||
|  |         if self.text: | ||||||
|  |             table.columns.append(tables.RevisionText()) | ||||||
|  | 
 | ||||||
|         if self.collapse_user: |         if self.collapse_user: | ||||||
|             table.columns.append(tables.RevisionCollapsed()) |             table.columns.append(tables.RevisionCollapsed()) | ||||||
| 
 | 
 | ||||||
| @ -345,6 +351,10 @@ class WikiqParser: | |||||||
|         schema = table.schema() |         schema = table.schema() | ||||||
|         schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) |         schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) | ||||||
| 
 | 
 | ||||||
|  |         if self.diff: | ||||||
|  |             from diff_pyarrow_schema import diff_field | ||||||
|  |             schema = schema.append(diff_field) | ||||||
|  | 
 | ||||||
|         # Add regex fields to the schema. |         # Add regex fields to the schema. | ||||||
|         for pair in self.regex_revision_pairs: |         for pair in self.regex_revision_pairs: | ||||||
|             for field in pair.get_pyarrow_fields(): |             for field in pair.get_pyarrow_fields(): | ||||||
| @ -412,9 +422,18 @@ class WikiqParser: | |||||||
| 
 | 
 | ||||||
|                 revision_texts.append(rev.text) |                 revision_texts.append(rev.text) | ||||||
| 
 | 
 | ||||||
|  |             wikidiff_matcher = None | ||||||
|  |             if self.diff or self.persist == PersistMethod.wikidiff2: | ||||||
|  |                 wikidiff_matcher = WikiDiffMatcher(revision_texts, | ||||||
|  |                                                    tokenizer=wikitext_split, | ||||||
|  |                                                    ) | ||||||
|  | 
 | ||||||
|             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. |             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. | ||||||
|             row_buffer = table.pop() |             row_buffer = table.pop() | ||||||
| 
 | 
 | ||||||
|  |             if self.diff: | ||||||
|  |                 row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))] | ||||||
|  | 
 | ||||||
|             is_revert_column: list[Union[bool, None]] = [] |             is_revert_column: list[Union[bool, None]] = [] | ||||||
|             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): |             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): | ||||||
|                 if self.revert_radius == 0 or d: |                 if self.revert_radius == 0 or d: | ||||||
| @ -428,6 +447,7 @@ class WikiqParser: | |||||||
|                 row_buffer[k] = v |                 row_buffer[k] = v | ||||||
|                 regex_matches = {} |                 regex_matches = {} | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|             if self.persist != PersistMethod.none: |             if self.persist != PersistMethod.none: | ||||||
|                 window = deque(maxlen=PERSISTENCE_RADIUS) |                 window = deque(maxlen=PERSISTENCE_RADIUS) | ||||||
| 
 | 
 | ||||||
| @ -443,9 +463,8 @@ class WikiqParser: | |||||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), |                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 elif self.persist == PersistMethod.wikidiff2: |                 elif self.persist == PersistMethod.wikidiff2: | ||||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, | 
 | ||||||
|                                                                     tokenizer=wikitext_split, |                     state = mwpersistence.DiffState(wikidiff_matcher, | ||||||
|                                                                     ), |  | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 else: |                 else: | ||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
| @ -469,8 +488,6 @@ class WikiqParser: | |||||||
|                         row_buffer['tokens_removed'].append(len(old_tokens_removed)) |                         row_buffer['tokens_removed'].append(len(old_tokens_removed)) | ||||||
|                         row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) |                         row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) | ||||||
| 
 | 
 | ||||||
|                 del row_buffer['text'] |  | ||||||
| 
 |  | ||||||
|                 # print out metadata for the last RADIUS revisions |                 # print out metadata for the last RADIUS revisions | ||||||
|                 for i, item in enumerate(window): |                 for i, item in enumerate(window): | ||||||
|                     # if the window was full, we've already printed item 0 |                     # if the window was full, we've already printed item 0 | ||||||
| @ -485,6 +502,9 @@ class WikiqParser: | |||||||
|                     row_buffer['tokens_removed'].append(len(tokens_removed)) |                     row_buffer['tokens_removed'].append(len(tokens_removed)) | ||||||
|                     row_buffer['tokens_window'].append(len(window) - (i + 1)) |                     row_buffer['tokens_window'].append(len(window) - (i + 1)) | ||||||
| 
 | 
 | ||||||
|  |                 if not self.text: | ||||||
|  |                     del row_buffer['text'] | ||||||
|  | 
 | ||||||
|             writer.write(pa.table(row_buffer, schema=schema)) |             writer.write(pa.table(row_buffer, schema=schema)) | ||||||
| 
 | 
 | ||||||
|             page_count += 1 |             page_count += 1 | ||||||
| @ -494,7 +514,6 @@ class WikiqParser: | |||||||
| 
 | 
 | ||||||
|         writer.close() |         writer.close() | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def match_archive_suffix(input_filename): | def match_archive_suffix(input_filename): | ||||||
|     if re.match(r'.*\.7z$', input_filename): |     if re.match(r'.*\.7z$', input_filename): | ||||||
|         cmd = ["7za", "x", "-so", input_filename] |         cmd = ["7za", "x", "-so", input_filename] | ||||||
| @ -580,6 +599,14 @@ def main(): | |||||||
|                         action='append', |                         action='append', | ||||||
|                         help="The label for the outputted column based on matching the regex in comments.") |                         help="The label for the outputted column based on matching the regex in comments.") | ||||||
| 
 | 
 | ||||||
|  |     parser.add_argument('-d', '--diff', dest="diff", default=False, | ||||||
|  |                         action='store_true', | ||||||
|  |                         help="Output a diff structure for each revision with information about changed or moved lines.") | ||||||
|  | 
 | ||||||
|  |     parser.add_argument('-t', '--text', dest="text", default=False, | ||||||
|  |                         action='store_true', | ||||||
|  |                         help="Output the text of the revision.") | ||||||
|  | 
 | ||||||
|     parser.add_argument('--fandom-2020', dest="fandom_2020", |     parser.add_argument('--fandom-2020', dest="fandom_2020", | ||||||
|                         action='store_true', |                         action='store_true', | ||||||
|                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") |                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") | ||||||
| @ -604,6 +631,7 @@ def main(): | |||||||
|     else: |     else: | ||||||
|         namespaces = None |         namespaces = None | ||||||
| 
 | 
 | ||||||
|  |     print(args, file=sys.stderr) | ||||||
|     if len(args.dumpfiles) > 0: |     if len(args.dumpfiles) > 0: | ||||||
|         for filename in args.dumpfiles: |         for filename in args.dumpfiles: | ||||||
|             input_file = open_input_file(filename, args.fandom_2020) |             input_file = open_input_file(filename, args.fandom_2020) | ||||||
| @ -637,6 +665,8 @@ def main(): | |||||||
|                                 regex_revision_label=args.regex_revision_label, |                                 regex_revision_label=args.regex_revision_label, | ||||||
|                                 regex_match_comment=args.regex_match_comment, |                                 regex_match_comment=args.regex_match_comment, | ||||||
|                                 regex_comment_label=args.regex_comment_label, |                                 regex_comment_label=args.regex_comment_label, | ||||||
|  |                                 text=args.text, | ||||||
|  |                                 diff=args.diff, | ||||||
|                                 output_parquet=output_parquet, |                                 output_parquet=output_parquet, | ||||||
|                                 ) |                                 ) | ||||||
| 
 | 
 | ||||||
| @ -656,7 +686,9 @@ def main(): | |||||||
|                             regex_match_revision=args.regex_match_revision, |                             regex_match_revision=args.regex_match_revision, | ||||||
|                             regex_revision_label=args.regex_revision_label, |                             regex_revision_label=args.regex_revision_label, | ||||||
|                             regex_match_comment=args.regex_match_comment, |                             regex_match_comment=args.regex_match_comment, | ||||||
|                             regex_comment_label=args.regex_comment_label) |                             regex_comment_label=args.regex_comment_label, | ||||||
|  |                             diff=args.diff, | ||||||
|  |                             text=args.text) | ||||||
| 
 | 
 | ||||||
|         wikiq.process() |         wikiq.process() | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user