Remove unnecessary urlencode tests
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									0d56267ae0
								
							
						
					
					
						commit
						032fec3198
					
				| @ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_WP_url_encode(self): | ||||
|         tester = WikiqTester(IKWIKI, "url-encode") | ||||
| 
 | ||||
|         try: | ||||
|             tester.call_wikiq("--url-encode") | ||||
|         except subprocess.CalledProcessError as exc: | ||||
|             self.fail(exc.stderr.decode("utf8")) | ||||
| 
 | ||||
|         copyfile(tester.call_output, tester.test_file) | ||||
| 
 | ||||
|         # as a test let's make sure that we get equal data frames | ||||
|         test = pd.read_table(tester.test_file) | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_WP_namespaces(self): | ||||
|         tester = WikiqTester(IKWIKI, "namespaces") | ||||
| 
 | ||||
| @ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase): | ||||
|         test = test.reindex(columns=sorted(test.columns)) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_url_encode(self): | ||||
|         tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") | ||||
| 
 | ||||
|         try: | ||||
|             tester.call_wikiq("--url-encode", "--fandom-2020") | ||||
|         except subprocess.CalledProcessError as exc: | ||||
|             self.fail(exc.stderr.decode("utf8")) | ||||
| 
 | ||||
|         copyfile(tester.call_output, tester.test_file) | ||||
|         test = pd.read_table(tester.test_file) | ||||
|         baseline = pd.read_table(tester.baseline_file) | ||||
| 
 | ||||
|         test = test.reindex(columns=sorted(test.columns)) | ||||
|         assert_frame_equal(test, baseline, check_like=True) | ||||
| 
 | ||||
|     def test_malformed_noargs(self): | ||||
|         tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z") | ||||
|         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' | ||||
|  | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										19
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										19
									
								
								wikiq
									
									
									
									
									
								
							| @ -239,9 +239,6 @@ class Revision: | ||||
|     editor: str | None = None | ||||
|     anon: bool | None = None | ||||
| 
 | ||||
|     # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation | ||||
|     urlencode = False | ||||
| 
 | ||||
|     # defines pyarrow schema. | ||||
|     # each field in the data class needs an entry in this array. | ||||
|     # the names should match and be in the same order. | ||||
| @ -333,7 +330,6 @@ class WikiqParser: | ||||
|                  regex_comment_label: list[str], | ||||
|                  collapse_user: bool = False, | ||||
|                  persist: int = None, | ||||
|                  urlencode: bool = False, | ||||
|                  namespaces: list[int] | None = None, | ||||
|                  revert_radius: int = 15, | ||||
|                  output_parquet: bool = True, | ||||
| @ -347,7 +343,6 @@ class WikiqParser: | ||||
|         self.collapse_user: bool = collapse_user | ||||
|         self.persist: int = persist | ||||
|         self.namespaces = [] | ||||
|         self.urlencode: bool = urlencode | ||||
|         self.revert_radius = revert_radius | ||||
| 
 | ||||
|         if namespaces is not None: | ||||
| @ -383,9 +378,7 @@ class WikiqParser: | ||||
|         # we also need to make sure that we have the right pyarrow schema | ||||
|         self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas | ||||
| 
 | ||||
|         self.revdata_type.urlencode = self.urlencode | ||||
| 
 | ||||
|         self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields) | ||||
|         self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields) | ||||
| 
 | ||||
|         # here we initialize the variables we need for output. | ||||
|         if output_parquet is True: | ||||
| @ -396,7 +389,7 @@ class WikiqParser: | ||||
|             self.parquet_buffer_size = parquet_buffer_size | ||||
|         else: | ||||
|             self.print_header = True | ||||
|             if output_file == sys.stdout: | ||||
|             if output_file == sys.stdout.buffer: | ||||
| 
 | ||||
|                 self.output_file = output_file | ||||
|             else: | ||||
| @ -724,9 +717,6 @@ def main(): | ||||
|                         choices=['', 'segment', 'sequence', 'legacy'], nargs='?', | ||||
|                         help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") | ||||
| 
 | ||||
|     parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", | ||||
|                         help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") | ||||
| 
 | ||||
|     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', | ||||
|                         help="Id number of namespace to include. Can be specified more than once.") | ||||
| 
 | ||||
| @ -792,7 +782,8 @@ def main(): | ||||
|             print("Processing file: %s" % filename, file=sys.stderr) | ||||
| 
 | ||||
|             if args.stdout: | ||||
|                 output_file = sys.stdout | ||||
|                 # Parquet libraries need a binary output, so just sys.stdout doesn't work. | ||||
|                 output_file = sys.stdout.buffer | ||||
|             else: | ||||
|                 filename = os.path.join(output_dir, os.path.basename(filename)) | ||||
|                 output_file = get_output_filename(filename, parquet=output_parquet) | ||||
| @ -801,7 +792,6 @@ def main(): | ||||
|                                 output_file, | ||||
|                                 collapse_user=args.collapse_user, | ||||
|                                 persist=persist, | ||||
|                                 urlencode=args.urlencode, | ||||
|                                 namespaces=namespaces, | ||||
|                                 revert_radius=args.revert_radius, | ||||
|                                 regex_match_revision=args.regex_match_revision, | ||||
| @ -821,7 +811,6 @@ def main(): | ||||
|                             collapse_user=args.collapse_user, | ||||
|                             persist=persist, | ||||
|                             # persist_legacy=args.persist_legacy, | ||||
|                             urlencode=args.urlencode, | ||||
|                             namespaces=namespaces, | ||||
|                             revert_radius=args.revert_radius, | ||||
|                             regex_match_revision=args.regex_match_revision, | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user