Remove unused code

This should help PR readability.

There is likely still some unused code, but that should be the
bulk of it.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-03 17:20:05 -05:00
parent b50c51a215
commit 8c707f5ef3

104
wikiq
View File

@ -8,14 +8,13 @@ import argparse
import sys
import os.path
import re
from datetime import datetime, timezone
from io import TextIOWrapper
from itertools import groupby
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from typing import Any, IO, TextIO, Final, Generator
from typing import Any, IO, TextIO, Generator
import mwxml
from mwxml import Dump
@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from pyarrow import Schema
import tables
from tables import RevisionTable
@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher, SegmentMatcher
import dataclasses as dc
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
@ -193,93 +189,6 @@ class RegexPair(object):
return temp_dict
def pa_schema() -> pa.Schema:
    """Return the pyarrow schema for wikiq's per-revision output rows."""
    # Column name -> arrow type, in output order. pa.field() defaults to
    # nullable=True, which matches the explicit nullable flags the schema
    # previously spelled out field-by-field.
    columns: dict[str, pa.DataType] = {
        "revid": pa.int64(),
        "date_time": pa.timestamp('s'),
        "articleid": pa.int64(),
        "editorid": pa.int64(),
        "title": pa.string(),
        "namespace": pa.int32(),
        "deleted": pa.bool_(),
        "text_chars": pa.int32(),
        "comment_chars": pa.int32(),
        "revert": pa.bool_(),
        # reverteds holds a comma-separated list of reverted revision ids.
        "reverteds": pa.string(),
        "sha1": pa.string(),
        "minor": pa.bool_(),
        "editor": pa.string(),
        "anon": pa.bool_(),
    }
    return pa.schema([pa.field(name, dtype) for name, dtype in columns.items()])
"""
We used to use a dictionary to collect fields for the output.
Now we use dataclasses. Compared to a dictionary, this should help:
- prevent some bugs
- make it easier to output parquet data.
- use class attribute '.' syntax instead of dictionary syntax.
- improve support for tooling (autocomplete, type hints)
- use type information to define formatting rules
Depending on the parameters passed into Wikiq, the output schema can be different.
Therefore, we need to end up constructing a dataclass with the correct output schema.
It also needs to have the correct pyarrow schema so we can write parquet files.
The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
"""
@dc.dataclass()
class Revision:
    """One output row: the fields wikiq emits for every revision.

    The first seven fields are always populated from the dump; the
    remaining fields are optional and default to None.
    """
    revid: int
    date_time: datetime
    articleid: int
    # Nullable in the pyarrow schema below; presumably None for
    # deleted/suppressed editors — TODO confirm against callers.
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int | None = None
    comment_chars: int | None = None
    revert: bool | None = None
    # reverteds is a comma-separated list of reverted revision ids.
    # Fix: was annotated plain `str` despite its None default.
    reverteds: str | None = None
    sha1: str | None = None
    minor: bool | None = None
    editor: str | None = None
    anon: bool | None = None

    # defines pyarrow schema.
    # each field in the data class needs an entry in this array.
    # the names should match and be in the same order.
    # this isn't a dataclass field since it doesn't have a type annotation
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64(), nullable=True),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        # NOTE(review): comment_chars is a dataclass field but its schema
        # entry is commented out, so to_pyarrow() silently drops it —
        # confirm whether that is intentional.
        # pa.field("comment_chars", pa.int32()),
        pa.field("revert", pa.bool_(), nullable=True),
        # reverteds is a string which contains a comma-separated list of reverted revision ids.
        pa.field("reverteds", pa.string(), nullable=True),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    # pyarrow is a columnar format, so most of the work happens in the
    # flush_parquet_buffer function; here we emit a one-row batch whose
    # columns follow pa_schema_fields (any dataclass field without a
    # schema entry is omitted from the batch).
    def to_pyarrow(self) -> pa.RecordBatch:
        d = dc.asdict(self)
        lists = [[d[field.name]] for field in self.pa_schema_fields]
        return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))
class WikiqParser:
def __init__(self,
input_file: TextIOWrapper | IO[Any] | IO[bytes],
@ -311,7 +220,8 @@ class WikiqParser:
self.namespace_filter = None
self.regex_schemas = []
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
regex_revision_label)
self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
# here we initialize the variables we need for output.
@ -512,9 +422,11 @@ class WikiqParser:
buffer['tokens_window'] = []
if self.persist == PersistMethod.sequence:
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.segment:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
else:
from mw.lib import persistence
state = persistence.State()
@ -551,7 +463,7 @@ class WikiqParser:
buffer['token_revs'].append(num_token_revs)
buffer['tokens_added'].append(num_tokens)
buffer['tokens_removed'].append(len(tokens_removed))
buffer['tokens_window'].append(len(window) - (i+1))
buffer['tokens_window'].append(len(window) - (i + 1))
writer.write(pa.table(buffer, schema=schema))