Move main logic to main()
This avoids:

1) the main function running when sourcing the file
2) creating many globally-scoped variables in the main logic

Also begin refactor of test output file logic.

Signed-off-by: Will Beason <willbeason@gmail.com>
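For readers less familiar with the pattern, here is a minimal sketch of what the refactor moves toward (illustrative names only, not wikiq's actual argument set): module-level statements run whenever the file is imported (sourced), while everything inside main() runs only when the script is executed directly, and its variables stay local instead of becoming module globals.

import argparse


def main():
    # Locals here are scoped to main() rather than leaking into module globals.
    parser = argparse.ArgumentParser(description="example CLI")
    parser.add_argument("--name", default="world")
    args = parser.parse_args()
    print("hello, " + args.name)


if __name__ == "__main__":
    # Not executed when the module is merely imported (e.g. by a test runner).
    main()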
parent 6d133575c7
commit 3d0bf89938
@@ -22,9 +22,6 @@ tracemalloc.start()
 
 class Test_Wikipedia(unittest.TestCase):
     def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
         self.wiki = 'ikwiki-20180301-pages-meta-history'
         self.wikiq_out_name = self.wiki + ".tsv"
         self.test_output_dir = os.path.join(".", "test_output")
@@ -109,9 +106,6 @@ class Test_Wikipedia(unittest.TestCase):
 class Test_Basic(unittest.TestCase):
 
     def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
         self.wiki = 'sailormoon'
         self.wikiq_out_name = self.wiki + ".tsv"
         self.test_output_dir = os.path.join(".", "test_output")
@@ -318,14 +312,14 @@ class Test_Regex(unittest.TestCase):
 
         self.test_output_dir = os.path.join(".", "test_output")
         self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-        # we have two base calls, one for checking inputs and the other for checking outputs
+        # we have two base calls, one for checking arguments and the other for checking outputs
         self.base_call = "../wikiq {0}"
         self.base_call_outs = "../wikiq {0} -o {1}"
 
         self.baseline_output_dir = "baseline_output"
 
-        # sample inputs for checking that bad inputs get terminated / test_regex_inputs
-        self.bad_inputs_list = [
+        # sample arguments for checking that bad arguments get terminated / test_regex_arguments
+        self.bad_arguments_list = [
             # label is missing
             "-RP '\\b\\d+\\b'",
             # number of reg and number of labels do not match
@@ -337,33 +331,33 @@ class Test_Regex(unittest.TestCase):
             "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
         ]
 
-        # sample inputs for checking the outcomes of good inputs / test_basic_regex
-        self.good_inputs_list = [
+        # sample arguments for checking the outcomes of good arguments / test_basic_regex
+        self.good_arguments_list = [
             "-RP '\\b\\d{3}\\b' -RPl threedigits",
             "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
             "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
             "-CP 'WP:EVADE' -CPl wp_evade"
         ]
 
-        self.cap_inputs_list = [
+        self.cap_arguments_list = [
             "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
             "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
         ]
 
-    def test_regex_inputs(self):
-        for input in self.bad_inputs_list:
+    def test_regex_arguments(self):
+        for arguments in self.bad_arguments_list:
             call = self.base_call.format(self.input_file)
-            call = call + " --stdout " + input
+            call = call + " --stdout " + arguments
             print(call)
             with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc:
                 stdout, stderr = proc.communicate()
-                # we want to check that the bad inputs were caught and sys.exit is stopping the code
+                # we want to check that the bad arguments were caught and sys.exit is stopping the code
                 print(stderr.decode("utf-8"))
 
                 self.assertNotEqual(proc.returncode, 0)
 
     def test_basic_regex(self):
-        for i, input in enumerate(self.good_inputs_list):
+        for i, arguments in enumerate(self.good_arguments_list):
 
             test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
             # print(test_filename)
@@ -372,7 +366,7 @@ class Test_Regex(unittest.TestCase):
                 os.remove(test_file)
 
             call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
+            call = call + " " + arguments
             print(call)
             with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc:
                 proc.wait()
@@ -388,7 +382,7 @@ class Test_Regex(unittest.TestCase):
             print(i)
 
     def test_capturegroup_regex(self):
-        for i, input in enumerate(self.cap_inputs_list):
+        for i, arguments in enumerate(self.cap_arguments_list):
             test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
             print(test_filename)
             test_file = os.path.join(self.test_output_dir, test_filename)
@@ -396,10 +390,9 @@ class Test_Regex(unittest.TestCase):
                 os.remove(test_file)
 
             call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
+            call = call + " " + arguments
             print(call)
 
-            print(call)
             with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc:
                 proc.wait()
                 assert (proc.returncode == 0)
@@ -414,4 +407,14 @@ class Test_Regex(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
+    if not os.path.exists("test_output"):
+        os.mkdir("test_output")
+    else:
+        # Avoid subsequent calls to tests interfering with each other.
+        # Otherwise, a test may erroneously pass if the program has no output
+        # but a previous run output what was expected.
+        for f in os.listdir("test_output"):
+            os.remove(os.path.join("test_output", f))
+
     unittest.main()
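One limitation of doing the test_output reset inside the __main__ block is that it only runs when this test file is executed directly; a runner such as python -m unittest would import the module and skip it. A hedged alternative (not what this commit does) is unittest's module-level setUpModule() fixture, sketched here with an illustrative test class:

import os
import unittest


def setUpModule():
    # Runs once before any test in this module, however the suite is launched.
    if not os.path.exists("test_output"):
        os.mkdir("test_output")
    else:
        # Clear stale files so a previous run's output cannot mask a missing output.
        for f in os.listdir("test_output"):
            os.remove(os.path.join("test_output", f))


class ExampleTest(unittest.TestCase):
    def test_output_dir_exists(self):
        self.assertTrue(os.path.isdir("test_output"))


if __name__ == "__main__":
    unittest.main()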
wikiq (32 changed lines)
@@ -44,7 +44,7 @@ def calculate_persistence(tokens_added):
             len(tokens_added))
 
 
-class WikiqIterator():
+class WikiqIterator:
     def __init__(self, fh, collapse_user=False):
         self.fh = fh
         self.collapse_user = collapse_user
@@ -66,7 +66,7 @@ class WikiqIterator():
         return next(self.__pages)
 
 
-class WikiqPage():
+class WikiqPage:
     __slots__ = ('id', 'title', 'namespace', 'redirect',
                  'restrictions', 'mwpage', '__revisions',
                  'collapse_user')
@@ -164,7 +164,7 @@ class RegexPair(object):
         return fields
 
     def _make_key(self, cap_group):
-        return ("{}_{}".format(self.label, cap_group))
+        return "{}_{}".format(self.label, cap_group)
 
     def matchmake(self, content, rev_data):
 
@@ -182,7 +182,7 @@ class RegexPair(object):
                     temp_list = []
                     for match in matchobjects:
                         # we only want to add the match for the capture group if the match is not None
-                        if match.group(cap_group) != None:
+                        if match.group(cap_group) is not None:
                             temp_list.append(match.group(cap_group))
 
                     # if temp_list of matches is empty just make that column None
@@ -234,7 +234,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq
 
 
 @dataclass()
-class RevDataBase():
+class RevDataBase:
     revid: int
     date_time: datetime
     articleid: int
@@ -358,7 +358,7 @@ class RevDataPersistence(RevDataBase):
 
 
 """
-class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields.
+class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields.
 
 """
 
@@ -446,7 +446,7 @@ class WikiqParser:
                 result.append(rp)
                 self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
             return result
-        elif (patterns is None and labels is None):
+        elif (patterns is None) and (labels is None):
             return []
         else:
             sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
@@ -580,7 +580,7 @@ class WikiqParser:
 
                 # TODO missing: additions_size deletions_size
 
-                # if collapse user was on, lets run that
+                # if collapse user was on, let's run that
                 if self.collapse_user:
                     rev_data.collapsed_revs = rev.collapsed_revs
 
@@ -704,6 +704,7 @@ class WikiqParser:
         line = rev_data.to_tsv_row()
         print(line, file=self.output_file)
 
+
 def open_input_file(input_filename):
     if re.match(r'.*\.7z$', input_filename):
         cmd = ["7za", "x", "-so", input_filename, "*.xml"]
@@ -719,6 +720,7 @@ def open_input_file(input_filename):
     except NameError:
         return open(input_filename, 'r')
 
+
 def get_output_filename(input_filename, parquet=False):
     output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
     output_filename = re.sub(r'\.xml', '', output_filename)
@@ -736,7 +738,8 @@ def open_output_file(input_filename):
     return output_file
 
 
-parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
+def main():
+    parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')
 
     # arguments for the input direction
     parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
@@ -753,13 +756,13 @@ parser.add_argument('--collapse-user', dest="collapse_user", action="store_true"
 
     parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
                         choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
-                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
+                        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
 
     parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                         help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
 
     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
-                    help="Id number of namspace to include. Can be specified more than once.")
+                        help="Id number of namespace to include. Can be specified more than once.")
 
     parser.add_argument('-rr',
                         '--revert-radius',
@@ -769,7 +772,8 @@ parser.add_argument('-rr',
                         default=15,
                         help="Number of edits to check when looking for reverts (default: 15)")
 
-parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
+    parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str,
+                        action='append',
                         help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
 
     parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str,
@@ -859,3 +863,7 @@ else:
 
     # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
     # stop_words = stop_words.split(",")
+
+
+if __name__ == "__main__":
+    main()
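Since the argument parser is now built inside main(), invoking the script from the command line behaves as before. A minimal sketch of how the tests above drive it as a subprocess, using options that appear in this diff (the dump filename is a placeholder, not one of the real fixtures):

import subprocess

# Placeholder input; the real tests use fixture dumps such as the sailormoon wiki.
dump = "dumps/example-pages-meta-history.xml.bz2"
call = "../wikiq " + dump + " -o test_output -RP '\\b\\d{3}\\b' -RPl threedigits"

with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc:
    proc.wait()
    assert proc.returncode == 0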