Remove unused code

This should help PR readability. There is likely still some unused code left, but this commit removes the bulk of it. Signed-off-by: Will Beason <willbeason@gmail.com>
2025-06-03 17:20:05 -05:00 · 2025-06-03 17:20:05 -05:00 · 8c707f5ef3
commit 8c707f5ef3
parent b50c51a215
1 changed files with 8 additions and 96 deletions
--- a/104
+++ b/104
@ -8,14 +8,13 @@ import argparse
 import sys
 import os.path
 import re
 from datetime import datetime, timezone
 from io import TextIOWrapper
 from itertools import groupby
 from subprocess import Popen, PIPE
 from collections import deque
 from hashlib import sha1
-from typing import Any, IO, TextIO, Final, Generator
+from typing import Any, IO, TextIO, Generator
 import mwxml
 from mwxml import Dump
@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
 import mwpersistence
 import mwreverts
 from pyarrow import Schema
 import tables
 from tables import RevisionTable
@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS = 7
 from deltas import SequenceMatcher, SegmentMatcher
 import dataclasses as dc
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pc
@ -193,93 +189,6 @@ class RegexPair(object):
        return temp_dict
 def pa_schema() -> pa.Schema:
     """Return the pyarrow schema describing one revision row.

     The schema lists the revision columns in output order. ``editorid``,
     ``revert`` and ``reverteds`` are explicitly declared nullable.
     """
     return pa.schema([
         pa.field("revid", pa.int64()),
         pa.field("date_time", pa.timestamp('s')),
         pa.field("articleid", pa.int64()),
         pa.field("editorid", pa.int64(), nullable=True),
         pa.field("title", pa.string()),
         pa.field("namespace", pa.int32()),
         pa.field("deleted", pa.bool_()),
         pa.field("text_chars", pa.int32()),
         pa.field("comment_chars", pa.int32()),
         pa.field("revert", pa.bool_(), nullable=True),
         # reverteds holds a comma-separated list of reverted revision ids,
         # stored as a single string.
         pa.field("reverteds", pa.string(), nullable=True),
         pa.field("sha1", pa.string()),
         pa.field("minor", pa.bool_()),
         pa.field("editor", pa.string()),
         pa.field("anon", pa.bool_()),
     ])
 """
 We used to use a dictionary to collect fields for the output. 
 Now we use dataclasses. Compared to a dictionary, this should help:
 - prevent some bugs
 - make it easier to output parquet data. 
 - use class attribute '.' syntax instead of dictionary syntax. 
 - improve support for tooling (autocomplete, type hints)
 - use type information to define formatting rules
 Depending on the parameters passed into Wikiq, the output schema can be different. 
 Therefore, we need to end up constructing a dataclass with the correct output schema. 
 It also needs to have the correct pyarrow schema so we can write parquet files.
 The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
 """
@dc.dataclass()
class Revision:
    """One output row holding the values recorded for a single revision.

    The dataclass fields carry the per-revision values. ``pa_schema_fields``
    describes how those values map onto pyarrow columns, and ``to_pyarrow``
    turns one instance into a single-row record batch using that mapping.
    """

    revid: int
    date_time: datetime
    articleid: int
    # Annotated as optional because the matching schema entry is explicitly
    # nullable. NOTE(review): presumably None for editors without an id - confirm.
    editorid: int | None
    title: str
    namespace: int
    deleted: bool
    text_chars: int | None = None
    comment_chars: int | None = None
    revert: bool | None = None
    # The default is None, so the annotation must allow it.
    reverteds: str | None = None
    sha1: str | None = None
    minor: bool | None = None
    editor: str | None = None
    anon: bool | None = None

    # defines pyarrow schema.
    # every entry's name must match a dataclass field name, because to_pyarrow
    # looks the values up by name; the entries are kept in field order.
    # NOTE(review): comment_chars is a dataclass field but its entry below is
    # commented out, so to_pyarrow does not emit it - confirm this is intended.
    # this isn't a dataclass field since it doesn't have a type annotation
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64(), nullable=True),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        # pa.field("comment_chars", pa.int32()),
        pa.field("revert", pa.bool_(), nullable=True),
        # reverteds is a string which contains a comma-separated list of reverted revision ids.
        pa.field("reverteds", pa.string(), nullable=True),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
    def to_pyarrow(self) -> pa.RecordBatch:
        """Return this revision as a one-row ``pa.RecordBatch``.

        Only the attributes named in ``pa_schema_fields`` are emitted, in that
        order. The schema is built from ``self.pa_schema_fields`` at call time,
        so a subclass that overrides the list gets its own schema.
        """
        # Read attributes directly instead of dc.asdict(), which would
        # deep-copy every field only to pick a few of them by name.
        columns = [[getattr(self, field.name)] for field in self.pa_schema_fields]
        return pa.record_batch(columns, schema=pa.schema(self.pa_schema_fields))
 class WikiqParser:
    def __init__(self,
                 input_file: TextIOWrapper | IO[Any] | IO[bytes],
@ -311,7 +220,8 @@ class WikiqParser:
            self.namespace_filter = None
        self.regex_schemas = []
-        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
                                                                               regex_revision_label)
        self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
        # here we initialize the variables we need for output.
@ -512,9 +422,11 @@ class WikiqParser:
                buffer['tokens_window'] = []
                if self.persist == PersistMethod.sequence:
-                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.segment:
-                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()
@ -551,7 +463,7 @@ class WikiqParser:
                    buffer['token_revs'].append(num_token_revs)
                    buffer['tokens_added'].append(num_tokens)
                    buffer['tokens_removed'].append(len(tokens_removed))
-                    buffer['tokens_window'].append(len(window) - (i+1))
+                    buffer['tokens_window'].append(len(window) - (i + 1))
            writer.write(pa.table(buffer, schema=schema))