a number of small updates and fixes

- fix regex for filename/filetype matches
- extract all files from 7z archives, not just ones that end with .xml
- fix bug that broke stdout
- minor cosmetic fixes
- updated mediawiki-utilities submodule to latest version
This commit is contained in:
Benjamin Mako Hill 2018-05-17 14:37:20 -07:00
parent 3f9da40747
commit ba886ecf4c
2 changed files with 15 additions and 14 deletions

@ -1 +1 @@
Subproject commit beba46e3eee8e0582cc3a5515dfa658ffbd18f9d Subproject commit f7329417ebb2f03d1e9b8a626236a3c0ce65c814

19
wikiq
View File

@ -272,11 +272,11 @@ class WikiqParser():
def open_input_file(input_filename): def open_input_file(input_filename):
if re.match(r'.*\.7z', input_filename): if re.match(r'.*\.7z$', input_filename):
cmd = ["7za", "x", "-so", input_filename, '*.xml'] cmd = ["7za", "x", "-so", input_filename, '*']
elif re.match(r'.*\.gz', input_filename): elif re.match(r'.*\.gz$', input_filename):
cmd = ["zcat", input_filename] cmd = ["zcat", input_filename]
elif re.match(r'.*\.bz2', input_filename): elif re.match(r'.*\.bz2$', input_filename):
cmd = ["zcat", input_filename] cmd = ["zcat", input_filename]
try: try:
@ -322,15 +322,17 @@ if len(args.dumpfiles) > 0:
for filename in args.dumpfiles: for filename in args.dumpfiles:
input_file = open_input_file(filename) input_file = open_input_file(filename)
# open file for output # open directory for output
if args.stdout:
output_file = sys.stdout
else:
if args.output_dir: if args.output_dir:
output_dir = args.output_dir[0] output_dir = args.output_dir[0]
else: else:
output_dir = "." output_dir = "."
print("Processing file: %s" % filename, file=sys.stderr)
if args.stdout:
output_file = sys.stdout
else:
filename = os.path.join(output_dir, os.path.basename(filename)) filename = os.path.join(output_dir, os.path.basename(filename))
output_file = open_output_file(filename) output_file = open_output_file(filename)
@ -339,7 +341,6 @@ if len(args.dumpfiles) > 0:
persist=args.persist, persist=args.persist,
urlencode=args.urlencode) urlencode=args.urlencode)
print("Processing file: %s" % filename, file=sys.stderr)
wikiq.process() wikiq.process()