Fix issue with .7z archives
Before, only Fandom wiki dumps were compressed with .7z. These archives can contain several .xml files, not just one, so we need a flag (`--fandom-2020`) for the fandom-2020 dumps. This fixes the bug so that .7z archives work in either case.
This commit is contained in:
parent
39fec0820d
commit
2a2b611d79
@ -6,7 +6,7 @@ readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"deltas>=0.7.0",
|
||||
"mw>=0.4.0",
|
||||
"mediawiki-utilities>=0.4.18",
|
||||
"mwpersistence>=0.2.4",
|
||||
"mwreverts>=0.1.5",
|
||||
"mwxml>=0.3.6",
|
||||
|
@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--collapse-user")
|
||||
tester.call_wikiq("--collapse-user --fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--persistence segment")
|
||||
tester.call_wikiq("--persistence segment --fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--persistence legacy")
|
||||
tester.call_wikiq("--persistence legacy --fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--persistence")
|
||||
tester.call_wikiq("--persistence --fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--url-encode")
|
||||
tester.call_wikiq("--url-encode --fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
|
||||
|
||||
try:
|
||||
outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
|
||||
outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
|
16
wikiq
16
wikiq
@ -704,17 +704,21 @@ class WikiqParser:
|
||||
line = rev_data.to_tsv_row()
|
||||
print(line, file=self.output_file)
|
||||
|
||||
|
||||
def open_input_file(input_filename):
|
||||
def match_archive_suffix(input_filename):
|
||||
if re.match(r'.*\.7z$', input_filename):
|
||||
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
|
||||
cmd = ["7za", "x", "-so", input_filename]
|
||||
elif re.match(r'.*\.gz$', input_filename):
|
||||
cmd = ["zcat", input_filename]
|
||||
elif re.match(r'.*\.bz2$', input_filename):
|
||||
cmd = ["bzcat", "-dk", input_filename]
|
||||
else:
|
||||
raise ValueError("Unrecognized file type: %s" % input_filename)
|
||||
return cmd
|
||||
|
||||
def open_input_file(input_filename, fandom_2020=False):
|
||||
cmd = match_archive_suffix(input_filename)
|
||||
if fandom_2020:
|
||||
cmd.append("*.xml")
|
||||
try:
|
||||
return Popen(cmd, stdout=PIPE).stdout
|
||||
except NameError:
|
||||
@ -787,6 +791,10 @@ def main():
|
||||
action='append',
|
||||
help="The label for the outputted column based on matching the regex in comments.")
|
||||
|
||||
parser.add_argument('--fandom-2020', dest="fandom_2020",
|
||||
action='store_true',
|
||||
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# set persistence method
|
||||
@ -808,7 +816,7 @@ def main():
|
||||
if len(args.dumpfiles) > 0:
|
||||
output_parquet = False
|
||||
for filename in args.dumpfiles:
|
||||
input_file = open_input_file(filename)
|
||||
input_file = open_input_file(filename, args.fandom_2020)
|
||||
|
||||
# open directory for output
|
||||
if args.output_dir:
|
||||
|
Loading…
Reference in New Issue
Block a user