Remove unused code

This should help PR readability. There is likely still some unused code, but that should be the bulk of it.

Signed-off-by: Will Beason <willbeason@gmail.com>
parent b50c51a215
commit 8c707f5ef3
wikiq (104 changed lines)
@@ -8,14 +8,13 @@ import argparse
 import sys
 import os.path
 import re
 from datetime import datetime, timezone
 from io import TextIOWrapper
-from itertools import groupby

 from subprocess import Popen, PIPE
 from collections import deque
 from hashlib import sha1
-from typing import Any, IO, TextIO, Final, Generator
+from typing import Any, IO, TextIO, Generator

 import mwxml
 from mwxml import Dump
@@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
 import mwpersistence
 import mwreverts

-from pyarrow import Schema

-import tables
 from tables import RevisionTable

@@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS = 7
 from deltas import SequenceMatcher, SegmentMatcher

-import dataclasses as dc
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pc
@@ -193,93 +189,6 @@ class RegexPair(object):
         return temp_dict


-def pa_schema() -> pa.Schema:
-    fields: list[pa.Field] = [
-        pa.field("revid", pa.int64()),
-        pa.field("date_time", pa.timestamp('s')),
-        pa.field("articleid", pa.int64()),
-        pa.field("editorid", pa.int64(), nullable=True),
-        pa.field("title", pa.string()),
-        pa.field("namespace", pa.int32()),
-        pa.field("deleted", pa.bool_()),
-        pa.field("text_chars", pa.int32()),
-        pa.field("comment_chars", pa.int32()),
-        pa.field("revert", pa.bool_(), nullable=True),
-        # reverteds is a string which contains a comma-separated list of reverted revision ids.
-        pa.field("reverteds", pa.string(), nullable=True),
-        pa.field("sha1", pa.string()),
-        pa.field("minor", pa.bool_()),
-        pa.field("editor", pa.string()),
-        pa.field("anon", pa.bool_())
-    ]
-    return pa.schema(fields)
-
-
-"""
-
-We used to use a dictionary to collect fields for the output.
-Now we use dataclasses. Compared to a dictionary, this should help:
-- prevent some bugs
-- make it easier to output parquet data.
-- use class attribute '.' syntax instead of dictionary syntax.
-- improve support for tooling (autocomplete, type hints)
-- use type information to define formatting rules
-
-Depending on the parameters passed into Wikiq, the output schema can be different.
-Therefore, we need to end up constructing a dataclass with the correct output schema.
-It also needs to have the correct pyarrow schema so we can write parquet files.
-
-The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
-"""
-
-
-@dc.dataclass()
-class Revision:
-    revid: int
-    date_time: datetime
-    articleid: int
-    editorid: int
-    title: str
-    namespace: int
-    deleted: bool
-    text_chars: int | None = None
-    comment_chars: int | None = None
-    revert: bool | None = None
-    reverteds: str = None
-    sha1: str | None = None
-    minor: bool | None = None
-    editor: str | None = None
-    anon: bool | None = None
-
-    # defines pyarrow schema.
-    # each field in the data class needs an entry in this array.
-    # the names should match and be in the same order.
-    # this isn't a dataclass field since it doesn't have a type annotation
-    pa_schema_fields = [
-        pa.field("revid", pa.int64()),
-        pa.field("date_time", pa.timestamp('s')),
-        pa.field("articleid", pa.int64()),
-        pa.field("editorid", pa.int64(), nullable=True),
-        pa.field("title", pa.string()),
-        pa.field("namespace", pa.int32()),
-        pa.field("deleted", pa.bool_()),
-        pa.field("text_chars", pa.int32()),
-        # pa.field("comment_chars", pa.int32()),
-        pa.field("revert", pa.bool_(), nullable=True),
-        # reverteds is a string which contains a comma-separated list of reverted revision ids.
-        pa.field("reverteds", pa.string(), nullable=True),
-        pa.field("sha1", pa.string()),
-        pa.field("minor", pa.bool_()),
-        pa.field("editor", pa.string()),
-        pa.field("anon", pa.bool_())
-    ]
-
-    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
-    def to_pyarrow(self) -> pa.RecordBatch:
-        d = dc.asdict(self)
-        lists = [[d[field.name]] for field in self.pa_schema_fields]
-        return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))
-
 class WikiqParser:
     def __init__(self,
                  input_file: TextIOWrapper | IO[Any] | IO[bytes],
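For context on what this hunk deletes: to_pyarrow turned a single dataclass instance into a one-row pyarrow RecordBatch by pairing each schema field with a one-element column. A minimal self-contained sketch of that pattern (the Point class and its fields are hypothetical, for illustration only, not part of wikiq):

    import dataclasses as dc
    import pyarrow as pa

    @dc.dataclass
    class Point:
        x: int
        y: int

    # One pa.field per dataclass field: same names, same order.
    fields = [pa.field("x", pa.int64()), pa.field("y", pa.int64())]

    def to_record_batch(row: Point) -> pa.RecordBatch:
        d = dc.asdict(row)
        # Each column is a single-element list, so the batch holds one row.
        columns = [[d[f.name]] for f in fields]
        return pa.record_batch(columns, schema=pa.schema(fields))

    print(to_record_batch(Point(x=1, y=2)))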
@@ -311,7 +220,8 @@ class WikiqParser:
             self.namespace_filter = None

         self.regex_schemas = []
-        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
+                                                                               regex_revision_label)
         self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

         # here we initialize the variables we need for output.
@@ -512,9 +422,11 @@ class WikiqParser:
                 buffer['tokens_window'] = []

             if self.persist == PersistMethod.sequence:
-                state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
+                                                revert_radius=PERSISTENCE_RADIUS)
             elif self.persist == PersistMethod.segment:
-                state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
+                                                revert_radius=PERSISTENCE_RADIUS)
             else:
                 from mw.lib import persistence
                 state = persistence.State()
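Both branches build a DiffState that is fed revisions oldest-first and tracks token persistence within the revert radius. A rough usage sketch, following the pattern documented in the mwpersistence README (the sample texts and revision ids are made up):

    import mwpersistence
    from deltas import SequenceMatcher
    from deltas.tokenizers import wikitext_split

    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                    revert_radius=7)

    # Each update diffs against the previous revision's text and returns
    # the current token list plus the tokens added and removed.
    for rev_id, text in [(1, "Apples are red."), (2, "Apples are blue.")]:
        current, added, removed = state.update(text, revision=rev_id)
        print(rev_id, len(added), len(removed))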
@@ -551,7 +463,7 @@ class WikiqParser:
                     buffer['token_revs'].append(num_token_revs)
                     buffer['tokens_added'].append(num_tokens)
                     buffer['tokens_removed'].append(len(tokens_removed))
-                    buffer['tokens_window'].append(len(window) - (i+1))
+                    buffer['tokens_window'].append(len(window) - (i + 1))

                 writer.write(pa.table(buffer, schema=schema))

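The buffer here is a dict of per-column Python lists: because pyarrow is columnar, rows are accumulated column-by-column and converted to a table in one shot when flushed. A minimal sketch of that buffer-and-flush pattern (the file name and columns are hypothetical):

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([pa.field("token_revs", pa.int64()),
                        pa.field("tokens_added", pa.int64())])

    # Accumulate values per column rather than per row.
    buffer = {"token_revs": [], "tokens_added": []}
    for token_revs, tokens_added in [(3, 10), (1, 4)]:
        buffer["token_revs"].append(token_revs)
        buffer["tokens_added"].append(tokens_added)

    with pq.ParquetWriter("example.parquet", schema) as writer:
        writer.write(pa.table(buffer, schema=schema))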