Remove unused code

This should help PR readability.

There is likely still some unused code, but that should be the
bulk of it.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-03 17:20:05 -05:00
parent b50c51a215
commit 8c707f5ef3

104
wikiq
View File

@ -8,14 +8,13 @@ import argparse
import sys
import os.path
import re
from datetime import datetime, timezone
from io import TextIOWrapper
from itertools import groupby
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from typing import Any, IO, TextIO, Final, Generator
from typing import Any, IO, TextIO, Generator
import mwxml
from mwxml import Dump
@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from pyarrow import Schema
import tables
from tables import RevisionTable
@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher, SegmentMatcher
import dataclasses as dc
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
@ -193,93 +189,6 @@ class RegexPair(object):
return temp_dict
def pa_schema() -> pa.Schema:
    """Return the pyarrow schema for wikiq's per-revision output rows."""
    # Column name -> arrow type, in output order. pa.field() defaults to
    # nullable=True, which matches the explicit nullable flags the schema
    # previously spelled out field-by-field.
    columns: dict[str, pa.DataType] = {
        "revid": pa.int64(),
        "date_time": pa.timestamp('s'),
        "articleid": pa.int64(),
        "editorid": pa.int64(),
        "title": pa.string(),
        "namespace": pa.int32(),
        "deleted": pa.bool_(),
        "text_chars": pa.int32(),
        "comment_chars": pa.int32(),
        "revert": pa.bool_(),
        # reverteds holds a comma-separated list of reverted revision ids.
        "reverteds": pa.string(),
        "sha1": pa.string(),
        "minor": pa.bool_(),
        "editor": pa.string(),
        "anon": pa.bool_(),
    }
    return pa.schema([pa.field(name, dtype) for name, dtype in columns.items()])
"""
We used to use a dictionary to collect fields for the output.
Now we use dataclasses. Compared to a dictionary, this should help:
- prevent some bugs
- make it easier to output parquet data.
- use class attribute '.' syntax instead of dictionary syntax.
- improve support for tooling (autocomplete, type hints)
- use type information to define formatting rules
Depending on the parameters passed into Wikiq, the output schema can be different.
Therefore, we need to end up constructing a dataclass with the correct output schema.
It also needs to have the correct pyarrow schema so we can write parquet files.
The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
"""
@dc.dataclass()
class Revision:
    """One output row: the fields wikiq emits for every revision.

    The first seven fields are always populated from the dump; the
    remaining fields are optional and default to None.
    """
    revid: int
    date_time: datetime
    articleid: int
    # Nullable in the pyarrow schema below; presumably None for
    # deleted/suppressed editors — TODO confirm against callers.
    editorid: int
    title: str
    namespace: int
    deleted: bool
    text_chars: int | None = None
    comment_chars: int | None = None
    revert: bool | None = None
    # reverteds is a comma-separated list of reverted revision ids.
    # Fix: was annotated plain `str` despite its None default.
    reverteds: str | None = None
    sha1: str | None = None
    minor: bool | None = None
    editor: str | None = None
    anon: bool | None = None

    # defines pyarrow schema.
    # each field in the data class needs an entry in this array.
    # the names should match and be in the same order.
    # this isn't a dataclass field since it doesn't have a type annotation
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64(), nullable=True),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        # NOTE(review): comment_chars is a dataclass field but its schema
        # entry is commented out, so to_pyarrow() silently drops it —
        # confirm whether that is intentional.
        # pa.field("comment_chars", pa.int32()),
        pa.field("revert", pa.bool_(), nullable=True),
        # reverteds is a string which contains a comma-separated list of reverted revision ids.
        pa.field("reverteds", pa.string(), nullable=True),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_())
    ]

    # pyarrow is a columnar format, so most of the work happens in the
    # flush_parquet_buffer function; here we emit a one-row batch whose
    # columns follow pa_schema_fields (any dataclass field without a
    # schema entry is omitted from the batch).
    def to_pyarrow(self) -> pa.RecordBatch:
        d = dc.asdict(self)
        lists = [[d[field.name]] for field in self.pa_schema_fields]
        return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))
class WikiqParser:
def __init__(self,
input_file: TextIOWrapper | IO[Any] | IO[bytes],
@ -311,7 +220,8 @@ class WikiqParser:
self.namespace_filter = None
self.regex_schemas = []
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
regex_revision_label)
self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
# here we initialize the variables we need for output.
@ -512,9 +422,11 @@ class WikiqParser:
buffer['tokens_window'] = []
if self.persist == PersistMethod.sequence:
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.segment:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
else:
from mw.lib import persistence
state = persistence.State()
@ -551,7 +463,7 @@ class WikiqParser:
buffer['token_revs'].append(num_token_revs)
buffer['tokens_added'].append(num_tokens)
buffer['tokens_removed'].append(len(tokens_removed))
buffer['tokens_window'].append(len(window) - (i+1))
buffer['tokens_window'].append(len(window) - (i + 1))
writer.write(pa.table(buffer, schema=schema))