Fix issue with .7z archives
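Previously, the only dumps compressed with .7z were Fandom wiki dumps. Those archives can contain several .xml files, not just one, so extraction was always restricted to "*.xml". We therefore need a flag for the fandom-2020 dumps: this commit adds `--fandom-2020` and applies the "*.xml" filter only when that flag is set. This fixes the bug so that .7z archives work in either case.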
This commit is contained in:
parent
39fec0820d
commit
2a2b611d79
@ -6,7 +6,7 @@ readme = "README.md"
|
|||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"deltas>=0.7.0",
|
"deltas>=0.7.0",
|
||||||
"mw>=0.4.0",
|
"mediawiki-utilities>=0.4.18",
|
||||||
"mwpersistence>=0.2.4",
|
"mwpersistence>=0.2.4",
|
||||||
"mwreverts>=0.1.5",
|
"mwreverts>=0.1.5",
|
||||||
"mwxml>=0.3.6",
|
"mwxml>=0.3.6",
|
||||||
|
@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--collapse-user")
|
tester.call_wikiq("--collapse-user --fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence segment")
|
tester.call_wikiq("--persistence segment --fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence legacy")
|
tester.call_wikiq("--persistence legacy --fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence")
|
tester.call_wikiq("--persistence --fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--url-encode")
|
tester.call_wikiq("--url-encode --fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
|
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
|
outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
|
16
wikiq
16
wikiq
@ -704,17 +704,21 @@ class WikiqParser:
|
|||||||
line = rev_data.to_tsv_row()
|
line = rev_data.to_tsv_row()
|
||||||
print(line, file=self.output_file)
|
print(line, file=self.output_file)
|
||||||
|
|
||||||
|
def match_archive_suffix(input_filename):
|
||||||
def open_input_file(input_filename):
|
|
||||||
if re.match(r'.*\.7z$', input_filename):
|
if re.match(r'.*\.7z$', input_filename):
|
||||||
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
|
cmd = ["7za", "x", "-so", input_filename]
|
||||||
elif re.match(r'.*\.gz$', input_filename):
|
elif re.match(r'.*\.gz$', input_filename):
|
||||||
cmd = ["zcat", input_filename]
|
cmd = ["zcat", input_filename]
|
||||||
elif re.match(r'.*\.bz2$', input_filename):
|
elif re.match(r'.*\.bz2$', input_filename):
|
||||||
cmd = ["bzcat", "-dk", input_filename]
|
cmd = ["bzcat", "-dk", input_filename]
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unrecognized file type: %s" % input_filename)
|
raise ValueError("Unrecognized file type: %s" % input_filename)
|
||||||
|
return cmd
|
||||||
|
|
||||||
|
def open_input_file(input_filename, fandom_2020=False):
|
||||||
|
cmd = match_archive_suffix(input_filename)
|
||||||
|
if fandom_2020:
|
||||||
|
cmd.append("*.xml")
|
||||||
try:
|
try:
|
||||||
return Popen(cmd, stdout=PIPE).stdout
|
return Popen(cmd, stdout=PIPE).stdout
|
||||||
except NameError:
|
except NameError:
|
||||||
@ -787,6 +791,10 @@ def main():
|
|||||||
action='append',
|
action='append',
|
||||||
help="The label for the outputted column based on matching the regex in comments.")
|
help="The label for the outputted column based on matching the regex in comments.")
|
||||||
|
|
||||||
|
parser.add_argument('--fandom-2020', dest="fandom_2020",
|
||||||
|
action='store_true',
|
||||||
|
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# set persistence method
|
# set persistence method
|
||||||
@ -808,7 +816,7 @@ def main():
|
|||||||
if len(args.dumpfiles) > 0:
|
if len(args.dumpfiles) > 0:
|
||||||
output_parquet = False
|
output_parquet = False
|
||||||
for filename in args.dumpfiles:
|
for filename in args.dumpfiles:
|
||||||
input_file = open_input_file(filename)
|
input_file = open_input_file(filename, args.fandom_2020)
|
||||||
|
|
||||||
# open directory for output
|
# open directory for output
|
||||||
if args.output_dir:
|
if args.output_dir:
|
||||||
|
Loading…
Reference in New Issue
Block a user