Remove unused code
This should help PR readability. There is likely still some unused code left, but this commit removes the bulk of it.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
b50c51a215
commit
8c707f5ef3
102
wikiq
102
wikiq
@ -8,14 +8,13 @@ import argparse
|
|||||||
import sys
|
import sys
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timezone
|
|
||||||
from io import TextIOWrapper
|
from io import TextIOWrapper
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
|
|
||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
from typing import Any, IO, TextIO, Final, Generator
|
from typing import Any, IO, TextIO, Generator
|
||||||
|
|
||||||
import mwxml
|
import mwxml
|
||||||
from mwxml import Dump
|
from mwxml import Dump
|
||||||
@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
|
|||||||
import mwpersistence
|
import mwpersistence
|
||||||
import mwreverts
|
import mwreverts
|
||||||
|
|
||||||
from pyarrow import Schema
|
|
||||||
|
|
||||||
import tables
|
import tables
|
||||||
from tables import RevisionTable
|
from tables import RevisionTable
|
||||||
|
|
||||||
@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
|
|||||||
PERSISTENCE_RADIUS = 7
|
PERSISTENCE_RADIUS = 7
|
||||||
from deltas import SequenceMatcher, SegmentMatcher
|
from deltas import SequenceMatcher, SegmentMatcher
|
||||||
|
|
||||||
import dataclasses as dc
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
import pyarrow.csv as pc
|
import pyarrow.csv as pc
|
||||||
@ -193,93 +189,6 @@ class RegexPair(object):
|
|||||||
return temp_dict
|
return temp_dict
|
||||||
|
|
||||||
|
|
||||||
def pa_schema() -> pa.Schema:
|
|
||||||
fields: list[pa.Field] = [
|
|
||||||
pa.field("revid", pa.int64()),
|
|
||||||
pa.field("date_time", pa.timestamp('s')),
|
|
||||||
pa.field("articleid", pa.int64()),
|
|
||||||
pa.field("editorid", pa.int64(), nullable=True),
|
|
||||||
pa.field("title", pa.string()),
|
|
||||||
pa.field("namespace", pa.int32()),
|
|
||||||
pa.field("deleted", pa.bool_()),
|
|
||||||
pa.field("text_chars", pa.int32()),
|
|
||||||
pa.field("comment_chars", pa.int32()),
|
|
||||||
pa.field("revert", pa.bool_(), nullable=True),
|
|
||||||
# reverteds is a string which contains a comma-separated list of reverted revision ids.
|
|
||||||
pa.field("reverteds", pa.string(), nullable=True),
|
|
||||||
pa.field("sha1", pa.string()),
|
|
||||||
pa.field("minor", pa.bool_()),
|
|
||||||
pa.field("editor", pa.string()),
|
|
||||||
pa.field("anon", pa.bool_())
|
|
||||||
]
|
|
||||||
return pa.schema(fields)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
We used to use a dictionary to collect fields for the output.
|
|
||||||
Now we use dataclasses. Compared to a dictionary, this should help:
|
|
||||||
- prevent some bugs
|
|
||||||
- make it easier to output parquet data.
|
|
||||||
- use class attribute '.' syntax instead of dictionary syntax.
|
|
||||||
- improve support for tooling (autocomplete, type hints)
|
|
||||||
- use type information to define formatting rules
|
|
||||||
|
|
||||||
Depending on the parameters passed into Wikiq, the output schema can be different.
|
|
||||||
Therefore, we need to end up constructing a dataclass with the correct output schema.
|
|
||||||
It also needs to have the correct pyarrow schema so we can write parquet files.
|
|
||||||
|
|
||||||
The Revision type has all the fields that will be output no matter how wikiq is invoked.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@dc.dataclass()
|
|
||||||
class Revision:
|
|
||||||
revid: int
|
|
||||||
date_time: datetime
|
|
||||||
articleid: int
|
|
||||||
editorid: int
|
|
||||||
title: str
|
|
||||||
namespace: int
|
|
||||||
deleted: bool
|
|
||||||
text_chars: int | None = None
|
|
||||||
comment_chars: int | None = None
|
|
||||||
revert: bool | None = None
|
|
||||||
reverteds: str = None
|
|
||||||
sha1: str | None = None
|
|
||||||
minor: bool | None = None
|
|
||||||
editor: str | None = None
|
|
||||||
anon: bool | None = None
|
|
||||||
|
|
||||||
# defines pyarrow schema.
|
|
||||||
# each field in the data class needs an entry in this array.
|
|
||||||
# the names should match and be in the same order.
|
|
||||||
# this isn't a dataclass field since it doesn't have a type annotation
|
|
||||||
pa_schema_fields = [
|
|
||||||
pa.field("revid", pa.int64()),
|
|
||||||
pa.field("date_time", pa.timestamp('s')),
|
|
||||||
pa.field("articleid", pa.int64()),
|
|
||||||
pa.field("editorid", pa.int64(), nullable=True),
|
|
||||||
pa.field("title", pa.string()),
|
|
||||||
pa.field("namespace", pa.int32()),
|
|
||||||
pa.field("deleted", pa.bool_()),
|
|
||||||
pa.field("text_chars", pa.int32()),
|
|
||||||
# pa.field("comment_chars", pa.int32()),
|
|
||||||
pa.field("revert", pa.bool_(), nullable=True),
|
|
||||||
# reverteds is a string which contains a comma-separated list of reverted revision ids.
|
|
||||||
pa.field("reverteds", pa.string(), nullable=True),
|
|
||||||
pa.field("sha1", pa.string()),
|
|
||||||
pa.field("minor", pa.bool_()),
|
|
||||||
pa.field("editor", pa.string()),
|
|
||||||
pa.field("anon", pa.bool_())
|
|
||||||
]
|
|
||||||
|
|
||||||
# pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
|
|
||||||
def to_pyarrow(self) -> pa.RecordBatch:
|
|
||||||
d = dc.asdict(self)
|
|
||||||
lists = [[d[field.name]] for field in self.pa_schema_fields]
|
|
||||||
return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))
|
|
||||||
|
|
||||||
class WikiqParser:
|
class WikiqParser:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
input_file: TextIOWrapper | IO[Any] | IO[bytes],
|
input_file: TextIOWrapper | IO[Any] | IO[bytes],
|
||||||
@ -311,7 +220,8 @@ class WikiqParser:
|
|||||||
self.namespace_filter = None
|
self.namespace_filter = None
|
||||||
|
|
||||||
self.regex_schemas = []
|
self.regex_schemas = []
|
||||||
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
|
self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
|
||||||
|
regex_revision_label)
|
||||||
self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
|
self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
|
||||||
|
|
||||||
# here we initialize the variables we need for output.
|
# here we initialize the variables we need for output.
|
||||||
@ -512,9 +422,11 @@ class WikiqParser:
|
|||||||
buffer['tokens_window'] = []
|
buffer['tokens_window'] = []
|
||||||
|
|
||||||
if self.persist == PersistMethod.sequence:
|
if self.persist == PersistMethod.sequence:
|
||||||
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
|
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
|
||||||
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
elif self.persist == PersistMethod.segment:
|
elif self.persist == PersistMethod.segment:
|
||||||
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
|
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
|
||||||
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
else:
|
else:
|
||||||
from mw.lib import persistence
|
from mw.lib import persistence
|
||||||
state = persistence.State()
|
state = persistence.State()
|
||||||
|
Loading…
Reference in New Issue
Block a user