diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 0cb78dc..d78ed32 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -44,8 +44,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -65,8 +67,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1" print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) + copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -87,8 +91,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1 -rr 1" print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) + copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -125,8 +131,10 @@ class Test_Basic(unittest.TestCase): os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -144,9 +152,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --collapse-user" - - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -163,8 +172,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence segment" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -182,8 +193,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence legacy" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -201,8 +214,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -223,8 +238,9 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -252,11 +268,13 @@ class Test_Malformed(unittest.TestCase): def test_malformed_noargs(self): call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() - outs, errs = proc.communicate() - errlines = str(errs).split("\\n") - self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + self.assertNotEqual(proc.returncode, 0) + outs, errs = proc.communicate() + errlines = str(errs).split("\\n") + self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') class Test_Stdout(unittest.TestCase): @@ -337,13 +355,12 @@ class Test_Regex(unittest.TestCase): call = self.base_call.format(self.input_file) call = call + " --stdout " + input print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - stdout, stderr = proc.communicate() - # print(proc.returncode) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + stdout, stderr = proc.communicate() + # we want to check that the bad inputs were caught and sys.exit is stopping the code + print(stderr.decode("utf-8")) - # we want to check that the bad inputs were caught and sys.exit is stopping the code - print(stderr.decode("utf-8")) - self.assertNotEqual(proc.returncode, 0) + self.assertNotEqual(proc.returncode, 0) def test_basic_regex(self): for i, input in enumerate(self.good_inputs_list): @@ -357,9 +374,10 @@ class Test_Regex(unittest.TestCase): call = self.base_call_outs.format(self.input_file, self.test_output_dir) call = call + " " + input print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() copyfile(self.call_output, test_file) test = pd.read_table(test_file) @@ -381,8 +399,10 @@ class Test_Regex(unittest.TestCase): call = call + " " + input print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) diff --git a/wikiq b/wikiq index 7b59720..3c41f1d 100755 --- a/wikiq +++ b/wikiq @@ -97,6 +97,7 @@ class WikiqPage(): # 3 A B True # 4 A A False # Post-loop A Always + collapsed_revs = 0 for i, rev in enumerate(self.mwpage): # never yield the first time if i == 0: @@ -491,7 +492,7 @@ class WikiqParser: # Construct dump file iterator dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) - # extract list of namspaces + # extract list of namespaces self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces} page_count = 0 @@ -703,7 +704,6 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) - def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): cmd = ["7za", "x", "-so", input_filename, "*.xml"] @@ -711,14 +711,13 @@ def open_input_file(input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] + else: + raise ValueError("Unrecognized file type: %s" % input_filename) try: - input_file = Popen(cmd, stdout=PIPE).stdout + return Popen(cmd, stdout=PIPE).stdout except NameError: - input_file = open(input_filename, 'r') - - return input_file - + return open(input_filename, 'r') def get_output_filename(input_filename, parquet=False): output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)