Remove unused code
This should help PR readability. Some unused code likely remains, but this commit removes the bulk of it. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
		
							parent
							
								
									b50c51a215
								
							
						
					
					
						commit
						8c707f5ef3
					
				
							
								
								
									
										104
									
								
								wikiq
									
									
									
									
									
								
							
							
						
						
									
										104
									
								
								wikiq
									
									
									
									
									
								
							| @ -8,14 +8,13 @@ import argparse | |||||||
| import sys | import sys | ||||||
| import os.path | import os.path | ||||||
| import re | import re | ||||||
| from datetime import datetime, timezone |  | ||||||
| from io import TextIOWrapper | from io import TextIOWrapper | ||||||
| from itertools import groupby | from itertools import groupby | ||||||
| 
 | 
 | ||||||
| from subprocess import Popen, PIPE | from subprocess import Popen, PIPE | ||||||
| from collections import deque | from collections import deque | ||||||
| from hashlib import sha1 | from hashlib import sha1 | ||||||
| from typing import Any, IO, TextIO, Final, Generator | from typing import Any, IO, TextIO, Generator | ||||||
| 
 | 
 | ||||||
| import mwxml | import mwxml | ||||||
| from mwxml import Dump | from mwxml import Dump | ||||||
| @ -24,8 +23,6 @@ from deltas.tokenizers import wikitext_split | |||||||
| import mwpersistence | import mwpersistence | ||||||
| import mwreverts | import mwreverts | ||||||
| 
 | 
 | ||||||
| from pyarrow import Schema |  | ||||||
| 
 |  | ||||||
| import tables | import tables | ||||||
| from tables import RevisionTable | from tables import RevisionTable | ||||||
| 
 | 
 | ||||||
| @ -33,7 +30,6 @@ TO_ENCODE = ('title', 'editor') | |||||||
| PERSISTENCE_RADIUS = 7 | PERSISTENCE_RADIUS = 7 | ||||||
| from deltas import SequenceMatcher, SegmentMatcher | from deltas import SequenceMatcher, SegmentMatcher | ||||||
| 
 | 
 | ||||||
| import dataclasses as dc |  | ||||||
| import pyarrow as pa | import pyarrow as pa | ||||||
| import pyarrow.parquet as pq | import pyarrow.parquet as pq | ||||||
| import pyarrow.csv as pc | import pyarrow.csv as pc | ||||||
| @ -193,93 +189,6 @@ class RegexPair(object): | |||||||
|         return temp_dict |         return temp_dict | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def pa_schema() -> pa.Schema: |  | ||||||
|     fields: list[pa.Field] = [ |  | ||||||
|         pa.field("revid", pa.int64()), |  | ||||||
|         pa.field("date_time", pa.timestamp('s')), |  | ||||||
|         pa.field("articleid", pa.int64()), |  | ||||||
|         pa.field("editorid", pa.int64(), nullable=True), |  | ||||||
|         pa.field("title", pa.string()), |  | ||||||
|         pa.field("namespace", pa.int32()), |  | ||||||
|         pa.field("deleted", pa.bool_()), |  | ||||||
|         pa.field("text_chars", pa.int32()), |  | ||||||
|         pa.field("comment_chars", pa.int32()), |  | ||||||
|         pa.field("revert", pa.bool_(), nullable=True), |  | ||||||
|         # reverteds is a string which contains a comma-separated list of reverted revision ids. |  | ||||||
|         pa.field("reverteds", pa.string(), nullable=True), |  | ||||||
|         pa.field("sha1", pa.string()), |  | ||||||
|         pa.field("minor", pa.bool_()), |  | ||||||
|         pa.field("editor", pa.string()), |  | ||||||
|         pa.field("anon", pa.bool_()) |  | ||||||
|     ] |  | ||||||
|     return pa.schema(fields) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| """ |  | ||||||
| 
 |  | ||||||
| We used to use a dictionary to collect fields for the output.  |  | ||||||
| Now we use dataclasses. Compared to a dictionary, this should help: |  | ||||||
| - prevent some bugs |  | ||||||
| - make it easier to output parquet data.  |  | ||||||
| - use class attribute '.' syntax instead of dictionary syntax.  |  | ||||||
| - improve support for tooling (autocomplete, type hints) |  | ||||||
| - use type information to define formatting rules |  | ||||||
| 
 |  | ||||||
| Depending on the parameters passed into Wikiq, the output schema can be different.  |  | ||||||
| Therefore, we need to end up constructing a dataclass with the correct output schema.  |  | ||||||
| It also needs to have the correct pyarrow schema so we can write parquet files. |  | ||||||
| 
 |  | ||||||
| The RevDataBase type has all the fields that will be output no matter how wikiq is invoked. |  | ||||||
| """ |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @dc.dataclass() |  | ||||||
| class Revision: |  | ||||||
|     revid: int |  | ||||||
|     date_time: datetime |  | ||||||
|     articleid: int |  | ||||||
|     editorid: int |  | ||||||
|     title: str |  | ||||||
|     namespace: int |  | ||||||
|     deleted: bool |  | ||||||
|     text_chars: int | None = None |  | ||||||
|     comment_chars: int | None = None |  | ||||||
|     revert: bool | None = None |  | ||||||
|     reverteds: str = None |  | ||||||
|     sha1: str | None = None |  | ||||||
|     minor: bool | None = None |  | ||||||
|     editor: str | None = None |  | ||||||
|     anon: bool | None = None |  | ||||||
| 
 |  | ||||||
|     # defines pyarrow schema. |  | ||||||
|     # each field in the data class needs an entry in this array. |  | ||||||
|     # the names should match and be in the same order. |  | ||||||
|     # this isn't a dataclass field since it doesn't have a type annotation |  | ||||||
|     pa_schema_fields = [ |  | ||||||
|         pa.field("revid", pa.int64()), |  | ||||||
|         pa.field("date_time", pa.timestamp('s')), |  | ||||||
|         pa.field("articleid", pa.int64()), |  | ||||||
|         pa.field("editorid", pa.int64(), nullable=True), |  | ||||||
|         pa.field("title", pa.string()), |  | ||||||
|         pa.field("namespace", pa.int32()), |  | ||||||
|         pa.field("deleted", pa.bool_()), |  | ||||||
|         pa.field("text_chars", pa.int32()), |  | ||||||
|         # pa.field("comment_chars", pa.int32()), |  | ||||||
|         pa.field("revert", pa.bool_(), nullable=True), |  | ||||||
|         # reverteds is a string which contains a comma-separated list of reverted revision ids. |  | ||||||
|         pa.field("reverteds", pa.string(), nullable=True), |  | ||||||
|         pa.field("sha1", pa.string()), |  | ||||||
|         pa.field("minor", pa.bool_()), |  | ||||||
|         pa.field("editor", pa.string()), |  | ||||||
|         pa.field("anon", pa.bool_()) |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function |  | ||||||
|     def to_pyarrow(self) -> pa.RecordBatch: |  | ||||||
|         d = dc.asdict(self) |  | ||||||
|         lists = [[d[field.name]] for field in self.pa_schema_fields] |  | ||||||
|         return pa.record_batch(lists, schema=pa.schema(self.pa_schema_fields)) |  | ||||||
| 
 |  | ||||||
| class WikiqParser: | class WikiqParser: | ||||||
|     def __init__(self, |     def __init__(self, | ||||||
|                  input_file: TextIOWrapper | IO[Any] | IO[bytes], |                  input_file: TextIOWrapper | IO[Any] | IO[bytes], | ||||||
| @ -311,7 +220,8 @@ class WikiqParser: | |||||||
|             self.namespace_filter = None |             self.namespace_filter = None | ||||||
| 
 | 
 | ||||||
|         self.regex_schemas = [] |         self.regex_schemas = [] | ||||||
|         self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) |         self.regex_revision_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_revision, | ||||||
|  |                                                                                regex_revision_label) | ||||||
|         self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) |         self.regex_comment_pairs: list[RegexPair] = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) | ||||||
| 
 | 
 | ||||||
|         # here we initialize the variables we need for output. |         # here we initialize the variables we need for output. | ||||||
| @ -512,9 +422,11 @@ class WikiqParser: | |||||||
|                 buffer['tokens_window'] = [] |                 buffer['tokens_window'] = [] | ||||||
| 
 | 
 | ||||||
|                 if self.persist == PersistMethod.sequence: |                 if self.persist == PersistMethod.sequence: | ||||||
|                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) |                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), | ||||||
|  |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 elif self.persist == PersistMethod.segment: |                 elif self.persist == PersistMethod.segment: | ||||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) |                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||||
|  |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 else: |                 else: | ||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
|                     state = persistence.State() |                     state = persistence.State() | ||||||
| @ -551,7 +463,7 @@ class WikiqParser: | |||||||
|                     buffer['token_revs'].append(num_token_revs) |                     buffer['token_revs'].append(num_token_revs) | ||||||
|                     buffer['tokens_added'].append(num_tokens) |                     buffer['tokens_added'].append(num_tokens) | ||||||
|                     buffer['tokens_removed'].append(len(tokens_removed)) |                     buffer['tokens_removed'].append(len(tokens_removed)) | ||||||
|                     buffer['tokens_window'].append(len(window) - (i+1)) |                     buffer['tokens_window'].append(len(window) - (i + 1)) | ||||||
| 
 | 
 | ||||||
|             writer.write(pa.table(buffer, schema=schema)) |             writer.write(pa.table(buffer, schema=schema)) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user