Remove unnecessary urlencode tests

Signed-off-by: Will Beason <willbeason@gmail.com>

parent 0d56267ae0
commit 032fec3198
@@ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)
 
-    def test_WP_url_encode(self):
-        tester = WikiqTester(IKWIKI, "url-encode")
-
-        try:
-            tester.call_wikiq("--url-encode")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_WP_namespaces(self):
         tester = WikiqTester(IKWIKI, "namespaces")
 
@@ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase):
         test = test.reindex(columns=sorted(test.columns))
         assert_frame_equal(test, baseline, check_like=True)
 
-    def test_url_encode(self):
-        tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--url-encode", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        copyfile(tester.call_output, tester.test_file)
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-
-        test = test.reindex(columns=sorted(test.columns))
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_malformed_noargs(self):
         tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
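
Note: both removed tests followed the same run-and-compare pattern the surviving tests use. A minimal sketch of that pattern, assuming the suite's WikiqTester helper and its call_output/test_file/baseline_file attributes:

    import subprocess
    from shutil import copyfile

    import pandas as pd
    from pandas.testing import assert_frame_equal

    def run_and_compare(self, tester, *flags):
        # Run the wikiq CLI; surface its stderr if it fails.
        try:
            tester.call_wikiq(*flags)
        except subprocess.CalledProcessError as exc:
            self.fail(exc.stderr.decode("utf8"))
        # Keep a copy of the output, then compare it to the committed baseline.
        copyfile(tester.call_output, tester.test_file)
        test = pd.read_table(tester.test_file)
        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)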

Two file diffs suppressed because they are too large.

							
								
								
									
wikiq (19 lines changed)

@@ -239,9 +239,6 @@ class Revision:
     editor: str | None = None
     anon: bool | None = None
 
-    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
-    urlencode = False
-
     # defines pyarrow schema.
     # each field in the data class needs an entry in this array.
     # the names should match and be in the same order.
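
Note: the removed urlencode attribute leaned on a dataclass rule worth spelling out: only annotated names become dataclass fields, so the unannotated urlencode was a plain class attribute shared across instances. A quick illustration:

    from dataclasses import dataclass, fields

    @dataclass
    class Revision:
        editor: str | None = None  # annotated: a real dataclass field
        urlencode = False          # unannotated: a class attribute, not a field

    print([f.name for f in fields(Revision)])  # ['editor'] -- no 'urlencode'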
@@ -333,7 +330,6 @@ class WikiqParser:
                  regex_comment_label: list[str],
                  collapse_user: bool = False,
                  persist: int = None,
-                 urlencode: bool = False,
                  namespaces: list[int] | None = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
@@ -347,7+343,6 @@ class WikiqParser:
         self.collapse_user: bool = collapse_user
         self.persist: int = persist
         self.namespaces = []
-        self.urlencode: bool = urlencode
         self.revert_radius = revert_radius
 
         if namespaces is not None:
@@ -383,9 +378,7 @@ class WikiqParser:
         # we also need to make sure that we have the right pyarrow schema
         self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
 
-        self.revdata_type.urlencode = self.urlencode
-
-        self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields)
+        self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields)
 
         # here we initialize the variables we need for output.
         if output_parquet is True:
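
Note: with the mutable urlencode toggle gone, the schema can be built once and marked Final. A minimal sketch of the pattern, with a made-up field list standing in for revdata_type.pa_schema_fields:

    from typing import Final

    import pyarrow as pa

    # Hypothetical fields; the real list lives on the Revision dataclass.
    pa_schema_fields = [
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_()),
    ]

    # Final tells type checkers the schema is assigned exactly once.
    schema: Final[pa.Schema] = pa.schema(pa_schema_fields)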
@@ -396,7 +389,7 @@ class WikiqParser:
             self.parquet_buffer_size = parquet_buffer_size
         else:
             self.print_header = True
-            if output_file == sys.stdout:
+            if output_file == sys.stdout.buffer:
 
                 self.output_file = output_file
             else:
@@ -724,9 +717,6 @@ def main():
                         choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                         help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
 
-    parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
-                        help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                         help="Id number of namespace to include. Can be specified more than once.")
 
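
Note: the deleted --url-encode flag, per its help text, percent-encoded text fields to work around issues like newlines in editor names. The wikiq implementation isn't shown in this diff, but the standard-library version of that escaping looks like:

    from urllib.parse import quote, unquote

    editor = "some\neditor\tname"      # embedded newline/tab would break a TSV row
    encoded = quote(editor)            # 'some%0Aeditor%09name'
    assert unquote(encoded) == editor  # round-trips losslessly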
@@ -792,7 +782,8 @@ def main():
             print("Processing file: %s" % filename, file=sys.stderr)
 
             if args.stdout:
-                output_file = sys.stdout
+                # Parquet libraries need a binary output, so just sys.stdout doesn't work.
+                output_file = sys.stdout.buffer
             else:
                 filename = os.path.join(output_dir, os.path.basename(filename))
                 output_file = get_output_filename(filename, parquet=output_parquet)
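
Note: the sys.stdout -> sys.stdout.buffer switch matters because Parquet writers emit raw bytes, and the text-mode sys.stdout only accepts str. A minimal sketch, with made-up table contents:

    import sys

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"editor": ["alice", "bob"]})

    # sys.stdout.buffer is the underlying binary stream, so it accepts
    # the bytes that the Parquet writer produces.
    pq.write_table(table, sys.stdout.buffer)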
@@ -801,7 +792,6 @@ def main():
                                 output_file,
                                 collapse_user=args.collapse_user,
                                 persist=persist,
-                                urlencode=args.urlencode,
                                 namespaces=namespaces,
                                 revert_radius=args.revert_radius,
                                 regex_match_revision=args.regex_match_revision,
@@ -821,7 +811,6 @@ def main():
                             collapse_user=args.collapse_user,
                             persist=persist,
                             # persist_legacy=args.persist_legacy,
-                            urlencode=args.urlencode,
                             namespaces=namespaces,
                             revert_radius=args.revert_radius,
                             regex_match_revision=args.regex_match_revision,