added counting functionality to regex code
The regex code has historically returned the actual matched patterns and the named capture groups within regexes. When trying to count common and/or large patterns, this leads to very large outputs. I've added two new command-line options, -RPc and -CPc, that cause wikiq to return a count of the matches for each pattern (0 when there are no matches) instead of the matched text. -RPc applies to all revision patterns and -CPc applies to all comment patterns. I considered interfaces that would make it possible to count some patterns but not others, but concluded that this would make the interface too complicated. This code should be checked before it's merged.
This commit is contained in:
parent
4729371d5a
commit
2ff4d60613
52
wikiq
52
wikiq
@ -139,8 +139,7 @@ class RegexPair(object):
|
|||||||
def _make_key(self, cap_group):
|
def _make_key(self, cap_group):
|
||||||
return ("{}_{}".format(self.label, cap_group))
|
return ("{}_{}".format(self.label, cap_group))
|
||||||
|
|
||||||
def matchmake(self, content, rev_data):
|
def matchmake(self, content, rev_data, count_only=False):
|
||||||
|
|
||||||
temp_dict = {}
|
temp_dict = {}
|
||||||
# if there are named capture groups in the regex
|
# if there are named capture groups in the regex
|
||||||
if self.has_groups:
|
if self.has_groups:
|
||||||
@ -163,29 +162,41 @@ class RegexPair(object):
|
|||||||
temp_dict[key] = None
|
temp_dict[key] = None
|
||||||
# else we put in the list we made in the for-loop above
|
# else we put in the list we made in the for-loop above
|
||||||
else:
|
else:
|
||||||
temp_dict[key] = ', '.join(temp_list)
|
if count_only:
|
||||||
|
temp_dict[key] = len(temp_list)
|
||||||
|
else:
|
||||||
|
temp_dict[key] = ', '.join(temp_list)
|
||||||
|
|
||||||
# there are no matches at all in this revision content, we default values to None
|
# there are no matches at all in this revision content, we default values to None
|
||||||
else:
|
else:
|
||||||
for cap_group in self.capture_groups:
|
for cap_group in self.capture_groups:
|
||||||
key = self._make_key(cap_group)
|
key = self._make_key(cap_group)
|
||||||
temp_dict[key] = None
|
if count_only:
|
||||||
|
temp_dict[key] = 0
|
||||||
|
else:
|
||||||
|
temp_dict[key] = None
|
||||||
|
|
||||||
# there are no capture groups, we just search for all the matches of the regex
|
# there are no capture groups, we just search for all the matches of the regex
|
||||||
else:
|
else:
|
||||||
#given that there are matches to be made
|
#given that there are matches to be made
|
||||||
if content is not None and self.pattern.search(content) is not None:
|
if content is not None and self.pattern.search(content) is not None:
|
||||||
m = self.pattern.findall(content)
|
m = self.pattern.findall(content)
|
||||||
temp_dict[self.label] = ', '.join(m)
|
if count_only:
|
||||||
|
temp_dict[self.label] = len(m)
|
||||||
|
else:
|
||||||
|
temp_dict[self.label] = ', '.join(m)
|
||||||
else:
|
else:
|
||||||
temp_dict[self.label] = None
|
if count_only:
|
||||||
|
temp_dict[self.label] = 0
|
||||||
|
else:
|
||||||
|
temp_dict[self.label] = None
|
||||||
# update rev_data with our new columns
|
# update rev_data with our new columns
|
||||||
rev_data.update(temp_dict)
|
rev_data.update(temp_dict)
|
||||||
return rev_data
|
return rev_data
|
||||||
|
|
||||||
|
|
||||||
class WikiqParser():
|
class WikiqParser():
|
||||||
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
|
def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
persist : what persistence method to use. Takes a PersistMethod value
|
persist : what persistence method to use. Takes a PersistMethod value
|
||||||
@ -205,8 +216,10 @@ class WikiqParser():
|
|||||||
self.namespace_filter = None
|
self.namespace_filter = None
|
||||||
|
|
||||||
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
|
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
|
||||||
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
|
self.regex_revision_output_count = regex_revision_output_count
|
||||||
|
|
||||||
|
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
|
||||||
|
self.regex_comment_output_count = regex_comment_output_count
|
||||||
|
|
||||||
def make_matchmake_pairs(self, patterns, labels):
|
def make_matchmake_pairs(self, patterns, labels):
|
||||||
if (patterns is not None and labels is not None) and \
|
if (patterns is not None and labels is not None) and \
|
||||||
@ -223,14 +236,14 @@ class WikiqParser():
|
|||||||
return rev_data
|
return rev_data
|
||||||
|
|
||||||
def matchmake_revision(self, text, rev_data):
|
def matchmake_revision(self, text, rev_data):
|
||||||
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
|
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
|
||||||
|
|
||||||
def matchmake_comment(self, comment, rev_data):
|
def matchmake_comment(self, comment, rev_data):
|
||||||
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
|
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
|
||||||
|
|
||||||
def matchmake_pairs(self, text, rev_data, pairs):
|
def matchmake_pairs(self, text, rev_data, pairs, count_only):
|
||||||
for pair in pairs:
|
for pair in pairs:
|
||||||
rev_data = pair.matchmake(text, rev_data)
|
rev_data = pair.matchmake(text, rev_data, count_only)
|
||||||
return rev_data
|
return rev_data
|
||||||
|
|
||||||
def __get_namespace_from_title(self, title):
|
def __get_namespace_from_title(self, title):
|
||||||
@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de
|
|||||||
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
|
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
|
||||||
help="The label for the outputted column based on matching the regex in revision text.")
|
help="The label for the outputted column based on matching the regex in revision text.")
|
||||||
|
|
||||||
|
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
|
||||||
|
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
|
||||||
|
|
||||||
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
|
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
|
||||||
help="The regular expression to search for in comments of revisions.")
|
help="The regular expression to search for in comments of revisions.")
|
||||||
|
|
||||||
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
|
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
|
||||||
help="The label for the outputted column based on matching the regex in comments.")
|
help="The label for the outputted column based on matching the regex in comments.")
|
||||||
|
|
||||||
|
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
|
||||||
|
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# set persistence method
|
# set persistence method
|
||||||
@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0:
|
|||||||
revert_radius=args.revert_radius,
|
revert_radius=args.revert_radius,
|
||||||
regex_match_revision = args.regex_match_revision,
|
regex_match_revision = args.regex_match_revision,
|
||||||
regex_revision_label = args.regex_revision_label,
|
regex_revision_label = args.regex_revision_label,
|
||||||
|
regex_revision_output_count = args.regex_revision_output_count,
|
||||||
regex_match_comment = args.regex_match_comment,
|
regex_match_comment = args.regex_match_comment,
|
||||||
regex_comment_label = args.regex_comment_label)
|
regex_comment_label = args.regex_comment_label,
|
||||||
|
regex_comment_output_count = args.regex_comment_output_count)
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
|
|
||||||
@ -566,8 +587,11 @@ else:
|
|||||||
revert_radius=args.revert_radius,
|
revert_radius=args.revert_radius,
|
||||||
regex_match_revision = args.regex_match_revision,
|
regex_match_revision = args.regex_match_revision,
|
||||||
regex_revision_label = args.regex_revision_label,
|
regex_revision_label = args.regex_revision_label,
|
||||||
|
regex_revision_output_count = args.regex_revision_output_count,
|
||||||
regex_match_comment = args.regex_match_comment,
|
regex_match_comment = args.regex_match_comment,
|
||||||
regex_comment_label = args.regex_comment_label)
|
regex_comment_label = args.regex_comment_label,
|
||||||
|
regex_comment_output_count = args.regex_comment_output_count)
|
||||||
|
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user