Remove unused dependencies and fix spacing

The "mw" and "numpy" dependencies were unneeded.

Spaces and tabs were used inconsistently. They are now applied
consistently; the changes were made with an auto-formatter.

Signed-off-by: Will Beason <willbeason@gmail.com>
Will Beason 2025-05-26 14:15:01 -05:00
parent 4804ecc4b3
commit 9c5bf577e6
2 changed files with 134 additions and 113 deletions
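
The commit message does not name the auto-formatter. As a sketch only, a PEP 8 formatter such as autopep8 (an assumption; the commit does not confirm the tool) can apply this kind of whitespace normalization from Python:

    # Hypothetical sketch: autopep8 is an assumption; the commit does not
    # say which formatter produced these changes.
    import autopep8

    with open("wikiq") as f:
        source = f.read()

    # fix_code() normalizes indentation (tabs vs. spaces) and PEP 8 blank
    # lines, the kind of change shown in the diff below.
    formatted = autopep8.fix_code(source)

    with open("wikiq", "w") as f:
        f.write(formatted)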


@ -1,4 +1,3 @@
apeek==0.1.1
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
@ -11,14 +10,12 @@ jsonable==0.3.1
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
mediawiki-utilities==0.4.18
-mw==0.4.0
mwcli==0.0.3
mwdiffs==0.0.2
mwpersistence==0.2.4
mwreverts==0.1.5
mwtypes==0.4.0
mwxml==0.3.6
-numpy==1.26.4
pandas==2.2.3
para==0.0.8
parsimonious==0.10.0
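
Before dropping pinned dependencies like these, one quick sanity check is to scan the source for imports of the removed packages. A minimal sketch in Python (illustrative only, not part of the commit):

    # Hypothetical check: report any line of wikiq that still imports the
    # packages removed from the requirements file.
    import re

    with open("wikiq") as f:
        for lineno, line in enumerate(f, start=1):
            if re.match(r"\s*(?:import|from)\s+(?:mw|numpy)\b", line):
                print(lineno, line.rstrip())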

wikiq

@ -6,7 +6,7 @@
import argparse
import sys
-import os, os.path
+import os.path
import re
from datetime import datetime, timezone
@ -20,6 +20,7 @@ from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher
@ -30,16 +31,19 @@ from dataclasses import dataclass
import pyarrow as pa
import pyarrow.parquet as pq
class PersistMethod:
none = 0
sequence = 1
segment = 2
legacy = 3
def calculate_persistence(tokens_added):
return (sum([(len(x.revisions) - 1) for x in tokens_added]),
len(tokens_added))
class WikiqIterator():
def __init__(self, fh, collapse_user=False):
self.fh = fh
@ -59,7 +63,8 @@ class WikiqIterator():
return self.__pages
def __next__(self):
-return next(self._pages)
+return next(self.__pages)
class WikiqPage():
__slots__ = ('id', 'title', 'namespace', 'redirect',
@ -138,6 +143,8 @@ A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
def __init__(self, pattern, label):
self.pattern = re.compile(pattern)
@ -206,6 +213,7 @@ class RegexPair(object):
return rev_data
"""
We used to use a dictionary to collect fields for the output.
@ -222,6 +230,8 @@ It also needs to have the correct pyarrow schema so we can write parquet files.
The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
"""
@dataclass()
class RevDataBase():
revid: int
@ -304,6 +314,7 @@ class RevDataBase():
def header_row(self):
return '\t'.join(map(lambda f: f.name, dc.fields(self)))
"""
If collapse=True we'll use a RevDataCollapse dataclass.
@ -312,6 +323,8 @@ This class inherits from RevDataBase. This means that it has all the same fields
It just adds a new field and updates the pyarrow schema.
"""
@dataclass()
class RevDataCollapse(RevDataBase):
collapsed_revs: int = None
@ -319,11 +332,14 @@ class RevDataCollapse(RevDataBase):
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
"""
If persistence data is to be computed we'll need the fields added by RevDataPersistence.
"""
@dataclass()
class RevDataPersistence(RevDataBase):
token_revs: int = None
@ -339,16 +355,22 @@ class RevDataPersistence(RevDataBase):
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
"""
class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields.
"""
@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
-class WikiqParser():
-def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
+class WikiqParser:
+def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label,
+regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None,
+revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@ -370,7 +392,6 @@ class WikiqParser():
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
# This is where we set the type for revdata.
if self.collapse_user is True:
@ -450,7 +471,7 @@ class WikiqParser():
for ns in self.namespaces:
# skip if the namespace is not defined
-if ns == None:
+if ns is None:
default_ns = self.namespaces[ns]
continue
@ -460,7 +481,6 @@ class WikiqParser():
# if we've made it this far with no matches, we return the default namespace
return default_ns
def process(self):
# create a regex that creates the output filename
@ -477,7 +497,6 @@ class WikiqParser():
page_count = 0
rev_count = 0
# Iterate through pages
for page in dump:
namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
@ -622,11 +641,11 @@ class WikiqParser():
else:
self.output_file.close()
"""
For performance reasons it's better to write parquet in batches instead of one row at a time.
So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
"""
def write_parquet_row(self, rev_data):
padata = rev_data.to_pyarrow()
self.parquet_buffer.append(padata)
@ -634,16 +653,17 @@ class WikiqParser():
if len(self.parquet_buffer) >= self.parquet_buffer_size:
self.flush_parquet_buffer()
"""
Function that actually writes data to the parquet file.
It needs to transpose the data from row-by-row to column-by-column
"""
def flush_parquet_buffer(self):
"""
Returns the pyarrow table that we'll write
"""
def rows_to_table(rg, schema):
cols = []
first = rg[0]
@ -661,7 +681,7 @@ class WikiqParser():
outtable = rows_to_table(self.parquet_buffer, self.schema)
if self.pq_writer is None:
-self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
+self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
self.pq_writer.write_table(outtable)
self.parquet_buffer = []
@ -699,6 +719,7 @@ def open_input_file(input_filename):
return input_file
def get_output_filename(input_filename, parquet=False):
output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
output_filename = re.sub(r'\.xml', '', output_filename)
@ -708,12 +729,14 @@ def get_output_filename(input_filename, parquet = False):
output_filename = output_filename + ".parquet"
return output_filename
def open_output_file(input_filename):
# create a regex that creates the output filename
output_filename = get_output_filename(input_filename, parquet=False)
output_file = open(output_filename, "w")
return output_file
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
# arguments for the input direction
@ -729,7 +752,8 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
-parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
+parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
+choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
@ -749,19 +773,19 @@ parser.add_argument('-rr',
parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
-parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
+parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str,
+action='append',
help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.")
-parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
+parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str,
+action='append',
help="The label for the outputted column based on matching the regex in comments.")
args = parser.parse_args()
# set persistence method
if args.persist is None: