Compare commits

..

No commits in common. "mako_changes-20230429" and "master" have entirely different histories.

4 changed files with 38 additions and 76 deletions

View File

@ -9,11 +9,5 @@ submodule like::
git submodule update git submodule update
Wikimedia dumps are usually in a compressed format such as 7z (most common), Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
gz, or bz2. Wikiq uses your computer's compression software to read these `7za`, `gzcat`, and `zcat`.
files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`.
There are also a series of Python dependencies. You can install these using pip
with a command like:
pip3 install mwbase mwreverts mwxml mwtypes mwcli mwdiffs mwpersistence pandas

View File

@ -1,9 +0,0 @@
Please add unit tests for the new count-only functionality.
line 43 def matchmake:
This was making redundant calls to regex matching functions and so could be slower than necessary. I suggest changes that use the walrus operator to keep the same logical structure without the redundant calls.
line 212 def __init__:
Minor note: This constructor is taking a lot of arguments. This is fine, but from a style + maintainability perspective it might make sense to create a new class for the regex matching configuration and pass a configuration object to this constructor instead.

View File

@ -3,7 +3,7 @@ import os
import subprocess import subprocess
from shutil import copyfile from shutil import copyfile
import pandas as pd import pandas as pd
from pandas.testing import assert_frame_equal from pandas.util.testing import assert_frame_equal
from io import StringIO from io import StringIO
# with / without pwr DONE # with / without pwr DONE

93
wikiq
View File

@ -139,61 +139,53 @@ class RegexPair(object):
def _make_key(self, cap_group): def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group)) return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data, count_only=False): def matchmake(self, content, rev_data):
temp_dict = {} temp_dict = {}
# if there are named capture groups in the regex # if there are named capture groups in the regex
if self.has_groups: if self.has_groups:
# if there are matches of some sort in this revision content, fill the lists for each cap_group # if there are matches of some sort in this revision content, fill the lists for each cap_group
if content is not None and len(matchobjects := list(self.pattern.finditer(content))) > 0: if self.pattern.search(content) is not None:
m = self.pattern.finditer(content)
matchobjects = list(m)
for cap_group in self.capture_groups: for cap_group in self.capture_groups:
key = self._make_key(cap_group) key = self._make_key(cap_group)
temp_list = [] temp_list = []
for match in matchobjects: for match in matchobjects:
# we only want to add the match for the capture group if the match is not None # we only want to add the match for the capture group if the match is not None
if (group := match.group(cap_group)) is not None: if match.group(cap_group) != None:
temp_list.append(group) temp_list.append(match.group(cap_group))
# if temp_list of matches is empty just make that column None # if temp_list of matches is empty just make that column None
if len(temp_list)==0: if len(temp_list)==0:
temp_dict[key] = None temp_dict[key] = None
# else we put in the list we made in the for-loop above # else we put in the list we made in the for-loop above
else: else:
if count_only: temp_dict[key] = ', '.join(temp_list)
temp_dict[key] = len(temp_list)
else:
temp_dict[key] = ', '.join(temp_list)
# there are no matches at all in this revision content, we default values to None # there are no matches at all in this revision content, we default values to None
else: else:
for cap_group in self.capture_groups: for cap_group in self.capture_groups:
key = self._make_key(cap_group) key = self._make_key(cap_group)
if count_only: temp_dict[key] = None
temp_dict[key] = 0
else:
temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex # there are no capture groups, we just search for all the matches of the regex
else: else:
#given that there are matches to be made #given that there are matches to be made
if content is not None and self.pattern.search(content) is not None: if self.pattern.search(content) is not None:
m = self.pattern.findall(content) m = self.pattern.findall(content)
if count_only: temp_dict[self.label] = ', '.join(m)
temp_dict[self.label] = len(m)
else:
temp_dict[self.label] = ', '.join(m)
else: else:
if count_only: temp_dict[self.label] = None
temp_dict[self.label] = 0
else:
temp_dict[self.label] = None
# update rev_data with our new columns # update rev_data with our new columns
rev_data.update(temp_dict) rev_data.update(temp_dict)
return rev_data return rev_data
class WikiqParser(): class WikiqParser():
def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
""" """
Parameters: Parameters:
persist : what persistence method to use. Takes a PersistMethod value persist : what persistence method to use. Takes a PersistMethod value
@ -212,11 +204,9 @@ class WikiqParser():
else: else:
self.namespace_filter = None self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label) self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_output_count = regex_revision_output_count self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label)
self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels): def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \ if (patterns is not None and labels is not None) and \
@ -233,14 +223,14 @@ class WikiqParser():
return rev_data return rev_data
def matchmake_revision(self, text, rev_data): def matchmake_revision(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count) return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
def matchmake_comment(self, comment, rev_data): def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count) return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
def matchmake_pairs(self, text, rev_data, pairs, count_only): def matchmake_pairs(self, text, rev_data, pairs):
for pair in pairs: for pair in pairs:
rev_data = pair.matchmake(text, rev_data, count_only) rev_data = pair.matchmake(text, rev_data)
return rev_data return rev_data
def __get_namespace_from_title(self, title): def __get_namespace_from_title(self, title):
@ -377,8 +367,6 @@ class WikiqParser():
rev_data['collapsed_revs'] = rev.collapsed_revs rev_data['collapsed_revs'] = rev.collapsed_revs
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
# initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
old_rev_data = {}
if rev.deleted.text: if rev.deleted.text:
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
old_rev_data[k] = None old_rev_data[k] = None
@ -500,24 +488,18 @@ parser.add_argument('-rr',
default=15, default=15,
help="Number of edits to check when looking for reverts (default: 15)") help="Number of edits to check when looking for reverts (default: 15)")
parser.add_argument('-RP', '--revision-pattern', dest="regex_revision_match", default=None, type=str, action='append', parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in revision text.") help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true', parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_comment_match", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.") help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.") help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
args = parser.parse_args() args = parser.parse_args()
# set persistence method # set persistence method
@ -561,12 +543,10 @@ if len(args.dumpfiles) > 0:
urlencode=args.urlencode, urlencode=args.urlencode,
namespaces=namespaces, namespaces=namespaces,
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_revision_match = args.regex_revision_match, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment,
regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()
@ -582,13 +562,10 @@ else:
urlencode=args.urlencode, urlencode=args.urlencode,
namespaces=namespaces, namespaces=namespaces,
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_revision_match = args.regex_revision_match, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment,
regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()