Remove unnecessary urlencode tests
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
0d56267ae0
commit
032fec3198
@ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
baseline = pd.read_table(tester.baseline_file)
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
def test_WP_url_encode(self):
|
|
||||||
tester = WikiqTester(IKWIKI, "url-encode")
|
|
||||||
|
|
||||||
try:
|
|
||||||
tester.call_wikiq("--url-encode")
|
|
||||||
except subprocess.CalledProcessError as exc:
|
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
|
||||||
|
|
||||||
copyfile(tester.call_output, tester.test_file)
|
|
||||||
|
|
||||||
# as a test let's make sure that we get equal data frames
|
|
||||||
test = pd.read_table(tester.test_file)
|
|
||||||
baseline = pd.read_table(tester.baseline_file)
|
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
|
||||||
|
|
||||||
def test_WP_namespaces(self):
|
def test_WP_namespaces(self):
|
||||||
tester = WikiqTester(IKWIKI, "namespaces")
|
tester = WikiqTester(IKWIKI, "namespaces")
|
||||||
|
|
||||||
@ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
test = test.reindex(columns=sorted(test.columns))
|
test = test.reindex(columns=sorted(test.columns))
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
def test_url_encode(self):
|
|
||||||
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
|
||||||
|
|
||||||
try:
|
|
||||||
tester.call_wikiq("--url-encode", "--fandom-2020")
|
|
||||||
except subprocess.CalledProcessError as exc:
|
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
|
||||||
|
|
||||||
copyfile(tester.call_output, tester.test_file)
|
|
||||||
test = pd.read_table(tester.test_file)
|
|
||||||
baseline = pd.read_table(tester.baseline_file)
|
|
||||||
|
|
||||||
test = test.reindex(columns=sorted(test.columns))
|
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
|
||||||
|
|
||||||
def test_malformed_noargs(self):
|
def test_malformed_noargs(self):
|
||||||
tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
|
tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
|
||||||
want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
|
want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
19
wikiq
19
wikiq
@ -239,9 +239,6 @@ class Revision:
|
|||||||
editor: str | None = None
|
editor: str | None = None
|
||||||
anon: bool | None = None
|
anon: bool | None = None
|
||||||
|
|
||||||
# toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
|
|
||||||
urlencode = False
|
|
||||||
|
|
||||||
# defines pyarrow schema.
|
# defines pyarrow schema.
|
||||||
# each field in the data class needs an entry in this array.
|
# each field in the data class needs an entry in this array.
|
||||||
# the names should match and be in the same order.
|
# the names should match and be in the same order.
|
||||||
@ -333,7 +330,6 @@ class WikiqParser:
|
|||||||
regex_comment_label: list[str],
|
regex_comment_label: list[str],
|
||||||
collapse_user: bool = False,
|
collapse_user: bool = False,
|
||||||
persist: int = None,
|
persist: int = None,
|
||||||
urlencode: bool = False,
|
|
||||||
namespaces: list[int] | None = None,
|
namespaces: list[int] | None = None,
|
||||||
revert_radius: int = 15,
|
revert_radius: int = 15,
|
||||||
output_parquet: bool = True,
|
output_parquet: bool = True,
|
||||||
@ -347,7 +343,6 @@ class WikiqParser:
|
|||||||
self.collapse_user: bool = collapse_user
|
self.collapse_user: bool = collapse_user
|
||||||
self.persist: int = persist
|
self.persist: int = persist
|
||||||
self.namespaces = []
|
self.namespaces = []
|
||||||
self.urlencode: bool = urlencode
|
|
||||||
self.revert_radius = revert_radius
|
self.revert_radius = revert_radius
|
||||||
|
|
||||||
if namespaces is not None:
|
if namespaces is not None:
|
||||||
@ -383,9 +378,7 @@ class WikiqParser:
|
|||||||
# we also need to make sure that we have the right pyarrow schema
|
# we also need to make sure that we have the right pyarrow schema
|
||||||
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
|
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
|
||||||
|
|
||||||
self.revdata_type.urlencode = self.urlencode
|
self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields)
|
||||||
|
|
||||||
self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields)
|
|
||||||
|
|
||||||
# here we initialize the variables we need for output.
|
# here we initialize the variables we need for output.
|
||||||
if output_parquet is True:
|
if output_parquet is True:
|
||||||
@ -396,7 +389,7 @@ class WikiqParser:
|
|||||||
self.parquet_buffer_size = parquet_buffer_size
|
self.parquet_buffer_size = parquet_buffer_size
|
||||||
else:
|
else:
|
||||||
self.print_header = True
|
self.print_header = True
|
||||||
if output_file == sys.stdout:
|
if output_file == sys.stdout.buffer:
|
||||||
|
|
||||||
self.output_file = output_file
|
self.output_file = output_file
|
||||||
else:
|
else:
|
||||||
@ -724,9 +717,6 @@ def main():
|
|||||||
choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
|
choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
|
||||||
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
|
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
|
||||||
|
|
||||||
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
|
|
||||||
help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
|
|
||||||
|
|
||||||
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
|
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
|
||||||
help="Id number of namespace to include. Can be specified more than once.")
|
help="Id number of namespace to include. Can be specified more than once.")
|
||||||
|
|
||||||
@ -792,7 +782,8 @@ def main():
|
|||||||
print("Processing file: %s" % filename, file=sys.stderr)
|
print("Processing file: %s" % filename, file=sys.stderr)
|
||||||
|
|
||||||
if args.stdout:
|
if args.stdout:
|
||||||
output_file = sys.stdout
|
# Parquet libraries need a binary output, so just sys.stdout doesn't work.
|
||||||
|
output_file = sys.stdout.buffer
|
||||||
else:
|
else:
|
||||||
filename = os.path.join(output_dir, os.path.basename(filename))
|
filename = os.path.join(output_dir, os.path.basename(filename))
|
||||||
output_file = get_output_filename(filename, parquet=output_parquet)
|
output_file = get_output_filename(filename, parquet=output_parquet)
|
||||||
@ -801,7 +792,6 @@ def main():
|
|||||||
output_file,
|
output_file,
|
||||||
collapse_user=args.collapse_user,
|
collapse_user=args.collapse_user,
|
||||||
persist=persist,
|
persist=persist,
|
||||||
urlencode=args.urlencode,
|
|
||||||
namespaces=namespaces,
|
namespaces=namespaces,
|
||||||
revert_radius=args.revert_radius,
|
revert_radius=args.revert_radius,
|
||||||
regex_match_revision=args.regex_match_revision,
|
regex_match_revision=args.regex_match_revision,
|
||||||
@ -821,7 +811,6 @@ def main():
|
|||||||
collapse_user=args.collapse_user,
|
collapse_user=args.collapse_user,
|
||||||
persist=persist,
|
persist=persist,
|
||||||
# persist_legacy=args.persist_legacy,
|
# persist_legacy=args.persist_legacy,
|
||||||
urlencode=args.urlencode,
|
|
||||||
namespaces=namespaces,
|
namespaces=namespaces,
|
||||||
revert_radius=args.revert_radius,
|
revert_radius=args.revert_radius,
|
||||||
regex_match_revision=args.regex_match_revision,
|
regex_match_revision=args.regex_match_revision,
|
||||||
|
Loading…
Reference in New Issue
Block a user