added counting functionality to regex code

The regex code has historically returned the actual matched patterns and the
named capture groups within regexes.  When trying to count common and/or large
patterns, this leads to very large outputs.

I've added two new command-line options, -RPc and -CPc, that cause wikiq to
return counts of each pattern (0 when there are no matches). -RPc applies to all
revision patterns and -CPc applies to all comment patterns. I considered
interfaces that would make it possible to count some patterns but not others,
but concluded this would be too complicated an interface.

This code should be checked before it's merged.
This commit is contained in:
Benjamin Mako Hill 2023-04-29 11:40:03 -07:00
parent 4729371d5a
commit 2ff4d60613

52
wikiq
View File

@ -139,8 +139,7 @@ class RegexPair(object):
def _make_key(self, cap_group): def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group)) return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data): def matchmake(self, content, rev_data, count_only=False):
temp_dict = {} temp_dict = {}
# if there are named capture groups in the regex # if there are named capture groups in the regex
if self.has_groups: if self.has_groups:
@ -163,29 +162,41 @@ class RegexPair(object):
temp_dict[key] = None temp_dict[key] = None
# else we put in the list we made in the for-loop above # else we put in the list we made in the for-loop above
else: else:
temp_dict[key] = ', '.join(temp_list) if count_only:
temp_dict[key] = len(temp_list)
else:
temp_dict[key] = ', '.join(temp_list)
# there are no matches at all in this revision content, we default values to None # there are no matches at all in this revision content, we default values to None
else: else:
for cap_group in self.capture_groups: for cap_group in self.capture_groups:
key = self._make_key(cap_group) key = self._make_key(cap_group)
temp_dict[key] = None if count_only:
temp_dict[key] = 0
else:
temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex # there are no capture groups, we just search for all the matches of the regex
else: else:
#given that there are matches to be made #given that there are matches to be made
if content is not None and self.pattern.search(content) is not None: if content is not None and self.pattern.search(content) is not None:
m = self.pattern.findall(content) m = self.pattern.findall(content)
temp_dict[self.label] = ', '.join(m) if count_only:
temp_dict[self.label] = len(m)
else:
temp_dict[self.label] = ', '.join(m)
else: else:
temp_dict[self.label] = None if count_only:
temp_dict[self.label] = 0
else:
temp_dict[self.label] = None
# update rev_data with our new columns # update rev_data with our new columns
rev_data.update(temp_dict) rev_data.update(temp_dict)
return rev_data return rev_data
class WikiqParser(): class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
""" """
Parameters: Parameters:
persist : what persistence method to use. Takes a PersistMethod value persist : what persistence method to use. Takes a PersistMethod value
@ -205,8 +216,10 @@ class WikiqParser():
self.namespace_filter = None self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_output_count = regex_revision_output_count
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels): def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \ if (patterns is not None and labels is not None) and \
@ -223,14 +236,14 @@ class WikiqParser():
return rev_data return rev_data
def matchmake_revision(self, text, rev_data): def matchmake_revision(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
def matchmake_comment(self, comment, rev_data): def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs) return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
def matchmake_pairs(self, text, rev_data, pairs): def matchmake_pairs(self, text, rev_data, pairs, count_only):
for pair in pairs: for pair in pairs:
rev_data = pair.matchmake(text, rev_data) rev_data = pair.matchmake(text, rev_data, count_only)
return rev_data return rev_data
def __get_namespace_from_title(self, title): def __get_namespace_from_title(self, title):
@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in revision text.") help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.") help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.") help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
args = parser.parse_args() args = parser.parse_args()
# set persistence method # set persistence method
@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0:
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment, regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label) regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()
@ -566,8 +587,11 @@ else:
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment, regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label) regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()