write regex captures to parquet arrays.
This commit is contained in:
8
wikiq
8
wikiq
@@ -202,7 +202,7 @@ class RegexPair(object):
|
||||
if type(content) in(str, bytes):
|
||||
if self.pattern.search(content) is not None:
|
||||
m = self.pattern.findall(content)
|
||||
- temp_dict[self.label] = ', '.join(m)
|
||||
+ temp_dict[self.label] = m
|
||||
else:
|
||||
temp_dict[self.label] = None
|
||||
|
||||
@@ -297,6 +297,9 @@ class RevDataBase():
|
||||
elif f.type == list[int]:
|
||||
row.append('"' + ",".join([str(x) for x in val]) + '"')
|
||||
|
||||
elif f.type == list[str]:
|
||||
row.append('"' + ",".join([(x) for x in val]) + '"')
|
||||
|
||||
elif f.type == str:
|
||||
if self.urlencode and f.name in TO_ENCODE:
|
||||
row.append(quote(str(val)))
|
||||
@@ -688,6 +691,7 @@ class WikiqParser():
|
||||
|
||||
# depending on if we are configured to write tsv or parquet, we'll call a different function.
|
||||
def print_rev_data(self, rev_data):
|
||||
|
||||
if self.output_parquet is False:
|
||||
printfunc = self.write_tsv_row
|
||||
else:
|
||||
@@ -840,7 +844,7 @@ if len(args.dumpfiles) > 0:
|
||||
filename = os.path.join(output_dir, os.path.basename(filename))
|
||||
output_file = get_output_filename(filename, parquet = output_parquet)
|
||||
|
||||
- print(args.siteinfo)
|
||||
+ print(args.siteinfo, file=sys.stderr)
|
||||
wikiq = WikiqParser(input_file,
|
||||
output_file,
|
||||
collapse_user=args.collapse_user,
|
||||
|
||||
Reference in New Issue
Block a user