diff --git a/wikiq b/wikiq index 3c1c904..a171cbf 100755 --- a/wikiq +++ b/wikiq @@ -8,14 +8,13 @@ import argparse import sys import os.path import re -from datetime import datetime, timezone from io import TextIOWrapper from itertools import groupby from subprocess import Popen, PIPE from collections import deque from hashlib import sha1 -from typing import Any, IO, TextIO, Final, Generator +from typing import Any, IO, TextIO, Generator import mwxml from mwxml import Dump @@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split import mwpersistence import mwreverts -from pyarrow import Schema - import tables from tables import RevisionTable @@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS = 7 from deltas import SequenceMatcher, SegmentMatcher -import dataclasses as dc import pyarrow as pa import pyarrow.parquet as pq import pyarrow.csv as pc @@ -193,93 +189,6 @@ class RegexPair(object): return temp_dict -def pa_schema() -> pa.Schema: - fields: list[pa.Field] = [ - pa.field("revid", pa.int64()), - pa.field("date_time", pa.timestamp('s')), - pa.field("articleid", pa.int64()), - pa.field("editorid", pa.int64(), nullable=True), - pa.field("title", pa.string()), - pa.field("namespace", pa.int32()), - pa.field("deleted", pa.bool_()), - pa.field("text_chars", pa.int32()), - pa.field("comment_chars", pa.int32()), - pa.field("revert", pa.bool_(), nullable=True), - # reverteds is a string which contains a comma-separated list of reverted revision ids. - pa.field("reverteds", pa.string(), nullable=True), - pa.field("sha1", pa.string()), - pa.field("minor", pa.bool_()), - pa.field("editor", pa.string()), - pa.field("anon", pa.bool_()) - ] - return pa.schema(fields) - - -""" - -We used to use a dictionary to collect fields for the output. -Now we use dataclasses. Compared to a dictionary, this should help: -- prevent some bugs -- make it easier to output parquet data. -- use class attribute '.' syntax instead of dictionary syntax. -- improve support for tooling (autocomplete, type hints) -- use type information to define formatting rules - -Depending on the parameters passed into Wikiq, the output schema can be different. -Therefore, we need to end up constructing a dataclass with the correct output schema. -It also needs to have the correct pyarrow schema so we can write parquet files. - -The RevDataBase type has all the fields that will be output no matter how wikiq is invoked. -""" - - -@dc.dataclass() -class Revision: - revid: int - date_time: datetime - articleid: int - editorid: int - title: str - namespace: int - deleted: bool - text_chars: int | None = None - comment_chars: int | None = None - revert: bool | None = None - reverteds: str = None - sha1: str | None = None - minor: bool | None = None - editor: str | None = None - anon: bool | None = None - - # defines pyarrow schema. - # each field in the data class needs an entry in this array. - # the names should match and be in the same order. - # this isn't a dataclass field since it doesn't have a type annotation - pa_schema_fields = [ - pa.field("revid", pa.int64()), - pa.field("date_time", pa.timestamp('s')), - pa.field("articleid", pa.int64()), - pa.field("editorid", pa.int64(), nullable=True), - pa.field("title", pa.string()), - pa.field("namespace", pa.int32()), - pa.field("deleted", pa.bool_()), - pa.field("text_chars", pa.int32()), - # pa.field("comment_chars", pa.int32()), - pa.field("revert", pa.bool_(), nullable=True), - # reverteds is a string which contains a comma-separated list of reverted revision ids. - pa.field("reverteds", pa.string(), nullable=True), - pa.field("sha1", pa.string()), - pa.field("minor", pa.bool_()), - pa.field("editor", pa.string()), - pa.field("anon", pa.bool_()) - ] - - # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function - def to_pyarrow(self) -> pa.RecordBatch: - d = dc.asdict(self) - lists = [[d[field.name]] for field in self.pa_schema_fields] - return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields)) - class WikiqParser: def __init__(self, input_file: TextIOWrapper | IO[Any] | IO[bytes], @@ -311,7 +220,8 @@ class WikiqParser: self.namespace_filter = None self.regex_schemas = [] - self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) + self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, + regex_revision_label) self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) # here we initialize the variables we need for output. @@ -512,9 +422,11 @@ class WikiqParser: buffer['tokens_window'] = [] if self.persist == PersistMethod.sequence: - state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) + state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), + revert_radius=PERSISTENCE_RADIUS) elif self.persist == PersistMethod.segment: - state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) + state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), + revert_radius=PERSISTENCE_RADIUS) else: from mw.lib import persistence state = persistence.State() @@ -551,7 +463,7 @@ class WikiqParser: buffer['token_revs'].append(num_token_revs) buffer['tokens_added'].append(num_tokens) buffer['tokens_removed'].append(len(tokens_removed)) - buffer['tokens_window'].append(len(window) - (i+1)) + buffer['tokens_window'].append(len(window) - (i + 1)) writer.write(pa.table(buffer, schema=schema))