From 3d0bf89938705b2b474f3ebb737ce97f156a2328 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 27 May 2025 11:10:42 -0500 Subject: [PATCH] Move main logic to main() This avoids: 1) the main function running when sourcing the file 2) Creating many globally-scoped variables in the main logic Also begin refactor of test output file logic Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 45 ++++----- wikiq | 200 +++++++++++++++++++++------------------- 2 files changed, 128 insertions(+), 117 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index d78ed32..be4777f 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -22,9 +22,6 @@ tracemalloc.start() class Test_Wikipedia(unittest.TestCase): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'ikwiki-20180301-pages-meta-history' self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") @@ -109,9 +106,6 @@ class Test_Wikipedia(unittest.TestCase): class Test_Basic(unittest.TestCase): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'sailormoon' self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") @@ -318,14 +312,14 @@ class Test_Regex(unittest.TestCase): self.test_output_dir = os.path.join(".", "test_output") self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - # we have two base calls, one for checking inputs and the other for checking outputs + # we have two base calls, one for checking arguments and the other for checking outputs self.base_call = "../wikiq {0}" self.base_call_outs = "../wikiq {0} -o {1}" self.baseline_output_dir = "baseline_output" - # sample inputs for checking that bad inputs get terminated / test_regex_inputs - self.bad_inputs_list = [ + # sample arguments for checking that bad arguments get terminated / test_regex_arguments + self.bad_arguments_list 
= [ # label is missing "-RP '\\b\\d+\\b'", # number of reg and number of labels do not match @@ -337,33 +331,33 @@ class Test_Regex(unittest.TestCase): "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" ] - # sample inputs for checking the outcomes of good inputs / test_basic_regex - self.good_inputs_list = [ + # sample arguments for checking the outcomes of good arguments / test_basic_regex + self.good_arguments_list = [ "-RP '\\b\\d{3}\\b' -RPl threedigits", "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", "-CP 'WP:EVADE' -CPl wp_evade" ] - self.cap_inputs_list = [ + self.cap_arguments_list = [ "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" ] - def test_regex_inputs(self): - for input in self.bad_inputs_list: + def test_regex_arguments(self): + for arguments in self.bad_arguments_list: call = self.base_call.format(self.input_file) - call = call + " --stdout " + input + call = call + " --stdout " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: stdout, stderr = proc.communicate() - # we want to check that the bad inputs were caught and sys.exit is stopping the code + # we want to check that the bad arguments were caught and sys.exit is stopping the code print(stderr.decode("utf-8")) self.assertNotEqual(proc.returncode, 0) def test_basic_regex(self): - for i, input in enumerate(self.good_inputs_list): + for i, arguments in enumerate(self.good_arguments_list): test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) # print(test_filename) @@ -372,7 +366,7 @@ class Test_Regex(unittest.TestCase): os.remove(test_file) call = 
self.base_call_outs.format(self.input_file, self.test_output_dir) - call = call + " " + input + call = call + " " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() @@ -388,7 +382,7 @@ class Test_Regex(unittest.TestCase): print(i) def test_capturegroup_regex(self): - for i, input in enumerate(self.cap_inputs_list): + for i, arguments in enumerate(self.cap_arguments_list): test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) print(test_filename) test_file = os.path.join(self.test_output_dir, test_filename) @@ -396,10 +390,9 @@ class Test_Regex(unittest.TestCase): os.remove(test_file) call = self.base_call_outs.format(self.input_file, self.test_output_dir) - call = call + " " + input + call = call + " " + arguments print(call) - print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() assert (proc.returncode == 0) @@ -414,4 +407,14 @@ class Test_Regex(unittest.TestCase): if __name__ == '__main__': + # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. + if not os.path.exists("test_output"): + os.mkdir("test_output") + else: + # Avoid subsequent calls to tests interfering with each other. + # Otherwise, a test may erroneously pass if the program has no output + # but a previous run output what was expected. 
+ for f in os.listdir("test_output"): + os.remove(os.path.join("test_output", f)) + unittest.main() diff --git a/wikiq b/wikiq index 3c41f1d..7553d8c 100755 --- a/wikiq +++ b/wikiq @@ -44,7 +44,7 @@ def calculate_persistence(tokens_added): len(tokens_added)) -class WikiqIterator(): +class WikiqIterator: def __init__(self, fh, collapse_user=False): self.fh = fh self.collapse_user = collapse_user @@ -66,7 +66,7 @@ class WikiqIterator(): return next(self.__pages) -class WikiqPage(): +class WikiqPage: __slots__ = ('id', 'title', 'namespace', 'redirect', 'restrictions', 'mwpage', '__revisions', 'collapse_user') @@ -164,7 +164,7 @@ class RegexPair(object): return fields def _make_key(self, cap_group): - return ("{}_{}".format(self.label, cap_group)) + return "{}_{}".format(self.label, cap_group) def matchmake(self, content, rev_data): @@ -182,7 +182,7 @@ class RegexPair(object): temp_list = [] for match in matchobjects: # we only want to add the match for the capture group if the match is not None - if match.group(cap_group) != None: + if match.group(cap_group) is not None: temp_list.append(match.group(cap_group)) # if temp_list of matches is empty just make that column None @@ -234,7 +234,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq @dataclass() -class RevDataBase(): +class RevDataBase: revid: int date_time: datetime articleid: int @@ -358,7 +358,7 @@ class RevDataPersistence(RevDataBase): """ -class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields. +class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields. 
""" @@ -446,7 +446,7 @@ class WikiqParser: result.append(rp) self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields() return result - elif (patterns is None and labels is None): + elif (patterns is None) and (labels is None): return [] else: sys.exit('Each regular expression *must* come with a corresponding label and vice versa.') @@ -580,7 +580,7 @@ class WikiqParser: # TODO missing: additions_size deletions_size - # if collapse user was on, lets run that + # if collapse user was on, let's run that if self.collapse_user: rev_data.collapsed_revs = rev.collapsed_revs @@ -704,6 +704,7 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) + def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): cmd = ["7za", "x", "-so", input_filename, "*.xml"] @@ -719,6 +720,7 @@ def open_input_file(input_filename): except NameError: return open(input_filename, 'r') + def get_output_filename(input_filename, parquet=False): output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) output_filename = re.sub(r'\.xml', '', output_filename) @@ -736,126 +738,132 @@ def open_output_file(input_filename): return output_file -parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.') +def main(): + parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.') -# arguments for the input direction -parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, - help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.") + # arguments for the input direction + parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, + help="Filename of the compressed or uncompressed XML database dump. 
If absent, we'll look for content on stdin and output on stdout.") -parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, - help="Directory for output files. If it ends with .parquet output will be in parquet format.") + parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, + help="Directory for output files. If it ends with .parquet output will be in parquet format.") -parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", - help="Write output to standard out (do not create dump file)") + parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", + help="Write output to standard out (do not create dump file)") -parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", - help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") + parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", + help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") -parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, - choices=['', 'segment', 'sequence', 'legacy'], nargs='?', - help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects.
Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") + parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, + choices=['', 'segment', 'sequence', 'legacy'], nargs='?', + help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") -parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", - help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") + parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", + help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") -parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', - help="Id number of namspace to include. Can be specified more than once.") + parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', + help="Id number of namespace to include.
Can be specified more than once.") -parser.add_argument('-rr', - '--revert-radius', - dest="revert_radius", - type=int, - action='store', - default=15, - help="Number of edits to check when looking for reverts (default: 15)") + parser.add_argument('-rr', + '--revert-radius', + dest="revert_radius", + type=int, + action='store', + default=15, + help="Number of edits to check when looking for reverts (default: 15)") -parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append', - help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") + parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, + action='append', + help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") -parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, - action='append', - help="The label for the outputted column based on matching the regex in revision text.") + parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, + action='append', + help="The label for the outputted column based on matching the regex in revision text.") -parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', - help="The regular expression to search for in comments of revisions.") + parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', + help="The regular expression to search for in comments of revisions.") -parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, - action='append', - help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", 
default=None, type=str, + action='append', + help="The label for the outputted column based on matching the regex in comments.") -args = parser.parse_args() + args = parser.parse_args() -# set persistence method + # set persistence method -if args.persist is None: - persist = PersistMethod.none -elif args.persist == "segment": - persist = PersistMethod.segment -elif args.persist == "legacy": - persist = PersistMethod.legacy -else: - persist = PersistMethod.sequence + if args.persist is None: + persist = PersistMethod.none + elif args.persist == "segment": + persist = PersistMethod.segment + elif args.persist == "legacy": + persist = PersistMethod.legacy + else: + persist = PersistMethod.sequence -if args.namespace_filter is not None: - namespaces = args.namespace_filter -else: - namespaces = None + if args.namespace_filter is not None: + namespaces = args.namespace_filter + else: + namespaces = None -if len(args.dumpfiles) > 0: - output_parquet = False - for filename in args.dumpfiles: - input_file = open_input_file(filename) + if len(args.dumpfiles) > 0: + output_parquet = False + for filename in args.dumpfiles: + input_file = open_input_file(filename) - # open directory for output - if args.output_dir: - output_dir = args.output_dir[0] - else: - output_dir = "." + # open directory for output + if args.output_dir: + output_dir = args.output_dir[0] + else: + output_dir = "." 
- if output_dir.endswith(".parquet"): - output_parquet = True + if output_dir.endswith(".parquet"): + output_parquet = True - print("Processing file: %s" % filename, file=sys.stderr) + print("Processing file: %s" % filename, file=sys.stderr) - if args.stdout: - output_file = sys.stdout - else: - filename = os.path.join(output_dir, os.path.basename(filename)) - output_file = get_output_filename(filename, parquet=output_parquet) + if args.stdout: + output_file = sys.stdout + else: + filename = os.path.join(output_dir, os.path.basename(filename)) + output_file = get_output_filename(filename, parquet=output_parquet) - wikiq = WikiqParser(input_file, - output_file, + wikiq = WikiqParser(input_file, + output_file, + collapse_user=args.collapse_user, + persist=persist, + urlencode=args.urlencode, + namespaces=namespaces, + revert_radius=args.revert_radius, + regex_match_revision=args.regex_match_revision, + regex_revision_label=args.regex_revision_label, + regex_match_comment=args.regex_match_comment, + regex_comment_label=args.regex_comment_label, + output_parquet=output_parquet) + + wikiq.process() + + # close things + input_file.close() + + else: + wikiq = WikiqParser(sys.stdin, + sys.stdout, collapse_user=args.collapse_user, persist=persist, + # persist_legacy=args.persist_legacy, urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, regex_match_revision=args.regex_match_revision, regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, - regex_comment_label=args.regex_comment_label, - output_parquet=output_parquet) + regex_comment_label=args.regex_comment_label) wikiq.process() - # close things - input_file.close() + # stop_words = 
"a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" + # stop_words = stop_words.split(",") -else: - wikiq = WikiqParser(sys.stdin, - sys.stdout, - collapse_user=args.collapse_user, - persist=persist, - # persist_legacy=args.persist_legacy, - urlencode=args.urlencode, - namespaces=namespaces, - revert_radius=args.revert_radius, - regex_match_revision=args.regex_match_revision, - regex_revision_label=args.regex_revision_label, - regex_match_comment=args.regex_match_comment, - regex_comment_label=args.regex_comment_label) - wikiq.process() - -# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" -# stop_words = stop_words.split(",") +if __name__ == "__main__": + main()