Remove unused code
This should help PR readability. There is likely still some unused code, but that should be the bulk of it.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:

parent b50c51a215
commit 8c707f5ef3

wikiq | 104
@@ -8,14 +8,13 @@ import argparse
 import sys
 import os.path
 import re
-from datetime import datetime, timezone
 from io import TextIOWrapper
 from itertools import groupby
 
 from subprocess import Popen, PIPE
 from collections import deque
 from hashlib import sha1
-from typing import Any, IO, TextIO, Final, Generator
+from typing import Any, IO, TextIO, Generator
 
 import mwxml
 from mwxml import Dump
@@ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split
 import mwpersistence
 import mwreverts
 
-from pyarrow import Schema
-
 import tables
 from tables import RevisionTable
 
@@ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS = 7
 from deltas import SequenceMatcher, SegmentMatcher
 
-import dataclasses as dc
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pc
@@ -193,93 +189,6 @@ class RegexPair(object):
         return temp_dict
 
 
-def pa_schema() -> pa.Schema:
-    fields: list[pa.Field] = [
-        pa.field("revid", pa.int64()),
-        pa.field("date_time", pa.timestamp('s')),
-        pa.field("articleid", pa.int64()),
-        pa.field("editorid", pa.int64(), nullable=True),
-        pa.field("title", pa.string()),
-        pa.field("namespace", pa.int32()),
-        pa.field("deleted", pa.bool_()),
-        pa.field("text_chars", pa.int32()),
-        pa.field("comment_chars", pa.int32()),
-        pa.field("revert", pa.bool_(), nullable=True),
-        # reverteds is a string which contains a comma-separated list of reverted revision ids.
-        pa.field("reverteds", pa.string(), nullable=True),
-        pa.field("sha1", pa.string()),
-        pa.field("minor", pa.bool_()),
-        pa.field("editor", pa.string()),
-        pa.field("anon", pa.bool_())
-    ]
-    return pa.schema(fields)
-
-
-"""
-
-We used to use a dictionary to collect fields for the output.
-Now we use dataclasses. Compared to a dictionary, this should help:
-- prevent some bugs
-- make it easier to output parquet data.
-- use class attribute '.' syntax instead of dictionary syntax.
-- improve support for tooling (autocomplete, type hints)
-- use type information to define formatting rules
-
-Depending on the parameters passed into Wikiq, the output schema can be different.
-Therefore, we need to end up constructing a dataclass with the correct output schema.
-It also needs to have the correct pyarrow schema so we can write parquet files.
-
-The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
-"""
-
-
-@dc.dataclass()
-class Revision:
-    revid: int
-    date_time: datetime
-    articleid: int
-    editorid: int
-    title: str
-    namespace: int
-    deleted: bool
-    text_chars: int | None = None
-    comment_chars: int | None = None
-    revert: bool | None = None
-    reverteds: str = None
-    sha1: str | None = None
-    minor: bool | None = None
-    editor: str | None = None
-    anon: bool | None = None
-
-    # defines pyarrow schema.
-    # each field in the data class needs an entry in this array.
-    # the names should match and be in the same order.
-    # this isn't a dataclass field since it doesn't have a type annotation
-    pa_schema_fields = [
-        pa.field("revid", pa.int64()),
-        pa.field("date_time", pa.timestamp('s')),
-        pa.field("articleid", pa.int64()),
-        pa.field("editorid", pa.int64(), nullable=True),
-        pa.field("title", pa.string()),
-        pa.field("namespace", pa.int32()),
-        pa.field("deleted", pa.bool_()),
-        pa.field("text_chars", pa.int32()),
-        # pa.field("comment_chars", pa.int32()),
-        pa.field("revert", pa.bool_(), nullable=True),
-        # reverteds is a string which contains a comma-separated list of reverted revision ids.
-        pa.field("reverteds", pa.string(), nullable=True),
-        pa.field("sha1", pa.string()),
-        pa.field("minor", pa.bool_()),
-        pa.field("editor", pa.string()),
-        pa.field("anon", pa.bool_())
-    ]
-
-    # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
-    def to_pyarrow(self) -> pa.RecordBatch:
-        d = dc.asdict(self)
-        lists = [[d[field.name]] for field in self.pa_schema_fields]
-        return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))
-
 class WikiqParser:
     def __init__(self,
                  input_file: TextIOWrapper | IO[Any] | IO[bytes],
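For orientation while reading the large removal above: the deleted Revision dataclass kept a pa_schema_fields list whose entries mirror the dataclass fields one-to-one, and to_pyarrow() turned a single instance into a one-row RecordBatch. Below is a trimmed, runnable sketch of that pattern; it keeps only three of the original fifteen fields for brevity.

# Condensed sketch of the removed pattern: a dataclass whose fields are
# mirrored, in order, by a list of pyarrow fields, plus a to_pyarrow()
# that emits a one-row RecordBatch. Only three fields are kept here.
import dataclasses as dc
from datetime import datetime

import pyarrow as pa


@dc.dataclass()
class Revision:
    revid: int
    date_time: datetime
    title: str

    # not a dataclass field: it has no type annotation, so dataclasses ignores it
    pa_schema_fields = [
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("title", pa.string()),
    ]

    def to_pyarrow(self) -> pa.RecordBatch:
        d = dc.asdict(self)
        # one single-element column per schema field, in schema order
        lists = [[d[field.name]] for field in self.pa_schema_fields]
        return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields))


rev = Revision(revid=1, date_time=datetime(2024, 1, 1), title="Example")
print(rev.to_pyarrow().num_rows)  # 1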
@@ -311,7 +220,8 @@ class WikiqParser:
             self.namespace_filter = None
 
         self.regex_schemas = []
-        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
+        self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision,
+                                                                               regex_revision_label)
         self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
 
         # here we initialize the variables we need for output.
@@ -512,9 +422,11 @@ class WikiqParser:
                 buffer['tokens_window'] = []
 
                 if self.persist == PersistMethod.sequence:
-                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
+                                                    revert_radius=PERSISTENCE_RADIUS)
                 elif self.persist == PersistMethod.segment:
-                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS)
+                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
+                                                    revert_radius=PERSISTENCE_RADIUS)
                 else:
                     from mw.lib import persistence
                     state = persistence.State()
@@ -551,7 +463,7 @@ class WikiqParser:
                     buffer['token_revs'].append(num_token_revs)
                     buffer['tokens_added'].append(num_tokens)
                     buffer['tokens_removed'].append(len(tokens_removed))
-                    buffer['tokens_window'].append(len(window) - (i+1))
+                    buffer['tokens_window'].append(len(window) - (i + 1))
 
             writer.write(pa.table(buffer, schema=schema))
 
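The last hunk above sits in wikiq's columnar output path: per-revision statistics accumulate in plain Python lists inside a dict, and the whole buffer is flushed in one call via writer.write(pa.table(buffer, schema=schema)). Here is a minimal sketch of that flush under stated assumptions: the column names come from the diff, while the int64 types, the sample values, and the tokens.parquet path are illustrative only.

# Minimal sketch of the dict-of-lists buffering seen in the last hunk.
# The int64 types and the "tokens.parquet" path are assumptions; the
# column names and the pa.table(...) flush come from the diff above.
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([
    pa.field("token_revs", pa.int64()),
    pa.field("tokens_added", pa.int64()),
    pa.field("tokens_removed", pa.int64()),
    pa.field("tokens_window", pa.int64()),
])

# one list per column, appended to once per revision in the window
buffer = {name: [] for name in schema.names}
buffer["token_revs"].append(3)
buffer["tokens_added"].append(12)
buffer["tokens_removed"].append(4)
buffer["tokens_window"].append(6)

with pq.ParquetWriter("tokens.parquet", schema) as writer:
    # pa.table() assembles the columns; a single write flushes the buffer
    writer.write(pa.table(buffer, schema=schema))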