Remove unnecessary urlencode tests

Signed-off-by: Will Beason <willbeason@gmail.com>

parent 0d56267ae0
commit 032fec3198
@@ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)

-    def test_WP_url_encode(self):
-        tester = WikiqTester(IKWIKI, "url-encode")
-
-        try:
-            tester.call_wikiq("--url-encode")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_WP_namespaces(self):
         tester = WikiqTester(IKWIKI, "namespaces")

@@ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase):
         test = test.reindex(columns=sorted(test.columns))
         assert_frame_equal(test, baseline, check_like=True)

-    def test_url_encode(self):
-        tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--url-encode", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-
-        test = test.reindex(columns=sorted(test.columns))
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_malformed_noargs(self):
         tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
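Note: both removed tests compared wikiq output against a stored baseline with pandas. For context, check_like=True makes assert_frame_equal ignore the order of index and columns, so only the contents have to match. A minimal sketch (the column names here are illustrative, not taken from wikiq):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    left = pd.DataFrame({"editor": ["alice"], "anon": [False]})
    right = left[["anon", "editor"]]  # same data, columns in a different order

    # check_like=True ignores index/column order, so this passes
    # even though the column order differs.
    assert_frame_equal(left, right, check_like=True)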
(Two file diffs suppressed because they are too large.)
wikiq (19 changed lines)
@@ -239,9 +239,6 @@ class Revision:
    editor: str | None = None
    anon: bool | None = None

-    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
-    urlencode = False
-
    # defines pyarrow schema.
    # each field in the data class needs an entry in this array.
    # the names should match and be in the same order.
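Note: the removed comment reflects real dataclass semantics. @dataclass only turns annotated class-level names into generated fields; an unannotated assignment like urlencode = False stays a plain class attribute. A quick self-contained illustration:

    from dataclasses import dataclass, fields

    @dataclass
    class Revision:
        editor: str | None = None  # annotated, so it becomes a dataclass field
        urlencode = False          # unannotated, so it stays a class attribute

    # Only the annotated name is reported as a field.
    print([f.name for f in fields(Revision)])  # ['editor']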
@@ -333,7 +330,6 @@ class WikiqParser:
                 regex_comment_label: list[str],
                 collapse_user: bool = False,
                 persist: int = None,
-                 urlencode: bool = False,
                 namespaces: list[int] | None = None,
                 revert_radius: int = 15,
                 output_parquet: bool = True,
@@ -347,7 +343,6 @@
        self.collapse_user: bool = collapse_user
        self.persist: int = persist
        self.namespaces = []
-        self.urlencode: bool = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
@@ -383,9 +378,7 @@
        # we also need to make sure that we have the right pyarrow schema
        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

-        self.revdata_type.urlencode = self.urlencode
-
-        self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields)
+        self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields)

        # here we initialize the variables we need for output.
        if output_parquet is True:
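Note: the replacement line annotates the schema with typing.Final, which tells type checkers the attribute is assigned once and never rebound; it has no effect at runtime. A minimal sketch of the idiom (the field shown is illustrative):

    from typing import Final

    import pyarrow as pa

    schema: Final[pa.Schema] = pa.schema([pa.field("editor", pa.string())])
    # A type checker such as mypy will flag any later reassignment of
    # `schema`; at runtime, Final changes nothing.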
@@ -396,7 +389,7 @@
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.print_header = True
-            if output_file == sys.stdout:
+            if output_file == sys.stdout.buffer:

                self.output_file = output_file
            else:
@@ -724,9 +717,6 @@ def main():
                        choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

-    parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
-                        help="Output URL-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
    parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                        help="Id number of namespace to include. Can be specified more than once.")

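Note on the removed flag: a raw newline inside a field such as an editor name splits a tab-separated row in two. Percent-encoding keeps the value on one line and is reversible. A minimal sketch of that idea using urllib.parse.quote (wikiq's actual encoder is not shown in this diff, so treat this as an assumption):

    from urllib.parse import quote, unquote

    editor = "bad\nname"     # a raw newline would break a TSV row
    encoded = quote(editor)  # 'bad%0Aname' stays on one line
    assert unquote(encoded) == editor  # and the encoding is reversible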
@@ -792,7 +782,8 @@ def main():
        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
-            output_file = sys.stdout
+            # Parquet libraries need a binary output, so just sys.stdout doesn't work.
+            output_file = sys.stdout.buffer
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet=output_parquet)
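Note: the added comment states the reason for this change. Parquet is a binary format; sys.stdout is a text stream, while sys.stdout.buffer exposes the underlying binary stream that Parquet writers can accept. A minimal sketch with pyarrow (the table contents are illustrative):

    import sys

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"editor": ["alice"], "anon": [False]})

    # write_table accepts a file-like object, but it must be binary:
    # sys.stdout.buffer works where plain sys.stdout would fail.
    pq.write_table(table, sys.stdout.buffer)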
@@ -801,7 +792,6 @@
                              output_file,
                              collapse_user=args.collapse_user,
                              persist=persist,
-                              urlencode=args.urlencode,
                              namespaces=namespaces,
                              revert_radius=args.revert_radius,
                              regex_match_revision=args.regex_match_revision,
@@ -821,7 +811,6 @@
                              collapse_user=args.collapse_user,
                              persist=persist,
                              # persist_legacy=args.persist_legacy,
-                              urlencode=args.urlencode,
                              namespaces=namespaces,
                              revert_radius=args.revert_radius,
                              regex_match_revision=args.regex_match_revision,