Remove unnecessary urlencode tests

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-30 13:20:10 -05:00
parent 0d56267ae0
commit 032fec3198
4 changed files with 4 additions and 32477 deletions

View File

@@ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)

-    def test_WP_url_encode(self):
-        tester = WikiqTester(IKWIKI, "url-encode")
-
-        try:
-            tester.call_wikiq("--url-encode")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_WP_namespaces(self):
         tester = WikiqTester(IKWIKI, "namespaces")
@@ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase):
         test = test.reindex(columns=sorted(test.columns))
         assert_frame_equal(test, baseline, check_like=True)

-    def test_url_encode(self):
-        tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--url-encode", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        test = test.reindex(columns=sorted(test.columns))
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_malformed_noargs(self):
         tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'

File diff suppressed because it is too large Load Diff

19
wikiq
View File

@@ -239,9 +239,6 @@ class Revision:
     editor: str | None = None
     anon: bool | None = None

-    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
-    urlencode = False
-
     # defines pyarrow schema.
     # each field in the data class needs an entry in this array.
     # the names should match and be in the same order.
@@ -333,7 +330,6 @@ class WikiqParser:
                  regex_comment_label: list[str],
                  collapse_user: bool = False,
                  persist: int = None,
-                 urlencode: bool = False,
                  namespaces: list[int] | None = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
@@ -347,7 +343,6 @@ class WikiqParser:
         self.collapse_user: bool = collapse_user
         self.persist: int = persist
         self.namespaces = []
-        self.urlencode: bool = urlencode
         self.revert_radius = revert_radius

         if namespaces is not None:
@@ -383,9 +378,7 @@ class WikiqParser:
         # we also need to make sure that we have the right pyarrow schema
         self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

-        self.revdata_type.urlencode = self.urlencode
-
-        self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields)
+        self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields)

         # here we initialize the variables we need for output.
         if output_parquet is True:
@@ -396,7 +389,7 @@ class WikiqParser:
             self.parquet_buffer_size = parquet_buffer_size
         else:
             self.print_header = True
-            if output_file == sys.stdout:
+            if output_file == sys.stdout.buffer:
                 self.output_file = output_file
             else:
@@ -724,9 +717,6 @@ def main():
                         choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                         help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

-    parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
-                        help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                         help="Id number of namespace to include. Can be specified more than once.")
@@ -792,7 +782,8 @@ def main():
         print("Processing file: %s" % filename, file=sys.stderr)

         if args.stdout:
-            output_file = sys.stdout
+            # Parquet libraries need a binary output, so just sys.stdout doesn't work.
+            output_file = sys.stdout.buffer
         else:
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = get_output_filename(filename, parquet=output_parquet)
@@ -801,7 +792,6 @@ def main():
                 output_file,
                 collapse_user=args.collapse_user,
                 persist=persist,
-                urlencode=args.urlencode,
                 namespaces=namespaces,
                 revert_radius=args.revert_radius,
                 regex_match_revision=args.regex_match_revision,
@@ -821,7 +811,6 @@ def main():
                 collapse_user=args.collapse_user,
                 persist=persist,
                 # persist_legacy=args.persist_legacy,
-                urlencode=args.urlencode,
                 namespaces=namespaces,
                 revert_radius=args.revert_radius,
                 regex_match_revision=args.regex_match_revision,