added counting functionality to regex code

The regex code has historically returned the actual matched patterns and the
named capture groups within regexes.  When trying to count common and/or large
patterns, this leads to very large outputs.

I've added two new command-line options, -RPc and -CPc, that cause wikiq to
return a count of the matches for each pattern (0 when there are no matches)
instead of the matched text. -RPc applies to all revision patterns and -CPc
applies to all comment patterns. I considered interfaces that would make it
possible to count some patterns but not others, but concluded that this would
be too complicated an interface.

This code should be checked before it's merged.
This commit is contained in:
Benjamin Mako Hill 2023-04-29 11:40:03 -07:00
parent 4729371d5a
commit 2ff4d60613

52
wikiq
View File

@ -139,8 +139,7 @@ class RegexPair(object):
def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data):
def matchmake(self, content, rev_data, count_only=False):
temp_dict = {}
# if there are named capture groups in the regex
if self.has_groups:
@ -163,29 +162,41 @@ class RegexPair(object):
temp_dict[key] = None
# else we put in the list we made in the for-loop above
else:
temp_dict[key] = ', '.join(temp_list)
if count_only:
temp_dict[key] = len(temp_list)
else:
temp_dict[key] = ', '.join(temp_list)
# there are no matches at all in this revision content, we default values to None
else:
for cap_group in self.capture_groups:
key = self._make_key(cap_group)
temp_dict[key] = None
if count_only:
temp_dict[key] = 0
else:
temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex
else:
#given that there are matches to be made
if content is not None and self.pattern.search(content) is not None:
m = self.pattern.findall(content)
temp_dict[self.label] = ', '.join(m)
if count_only:
temp_dict[self.label] = len(m)
else:
temp_dict[self.label] = ', '.join(m)
else:
temp_dict[self.label] = None
if count_only:
temp_dict[self.label] = 0
else:
temp_dict[self.label] = None
# update rev_data with our new columns
rev_data.update(temp_dict)
return rev_data
class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@ -205,8 +216,10 @@ class WikiqParser():
self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_output_count = regex_revision_output_count
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \
@ -223,14 +236,14 @@ class WikiqParser():
return rev_data
def matchmake_revision(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
def matchmake_pairs(self, text, rev_data, pairs):
def matchmake_pairs(self, text, rev_data, pairs, count_only):
for pair in pairs:
rev_data = pair.matchmake(text, rev_data)
rev_data = pair.matchmake(text, rev_data, count_only)
return rev_data
def __get_namespace_from_title(self, title):
@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
args = parser.parse_args()
# set persistence method
@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0:
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process()
@ -566,8 +587,11 @@ else:
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process()