Add call to compute diffs via local PHP server
This is inefficient as it requires an individual request per diff. The next step is to try collecting the revision texts to reduce communication overhead. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
0d9ab003f0
commit
96915a074b
@ -107,6 +107,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
|
||||
try:
|
||||
tester.call_wikiq()
|
||||
# tester.call_wikiq("--compute-incremental-diffs")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
self.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
|
48
wikiq
48
wikiq
@ -17,6 +17,7 @@ from hashlib import sha1
|
||||
from typing import Any, IO, TextIO, Generator, Union
|
||||
|
||||
import mwxml
|
||||
import requests
|
||||
from mwxml import Dump
|
||||
|
||||
from deltas.tokenizers import wikitext_split
|
||||
@ -34,6 +35,7 @@ import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.csv as pacsv
|
||||
|
||||
DIFFS_URL = 'http://localhost:8000'
|
||||
|
||||
class PersistMethod:
|
||||
none = 0
|
||||
@ -214,7 +216,10 @@ class WikiqParser:
|
||||
namespaces: Union[list[int], None] = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
parquet_buffer_size: int = 2000):
|
||||
parquet_buffer_size: int = 2000,
|
||||
compute_incremental_diffs: bool = False,
|
||||
):
|
||||
|
||||
"""
|
||||
Parameters:
|
||||
persist : what persistence method to use. Takes a PersistMethod value
|
||||
@ -225,6 +230,7 @@ class WikiqParser:
|
||||
self.persist: int = persist
|
||||
self.namespaces = []
|
||||
self.revert_radius = revert_radius
|
||||
self.compute_incremental_diffs: bool = compute_incremental_diffs
|
||||
|
||||
if namespaces is not None:
|
||||
self.namespace_filter = set(namespaces)
|
||||
@ -360,6 +366,9 @@ class WikiqParser:
|
||||
schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True))
|
||||
schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
|
||||
|
||||
if self.compute_incremental_diffs:
|
||||
schema = schema.append(pa.field('incremental diffs', pa.string()))
|
||||
|
||||
if self.output_parquet:
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||
else:
|
||||
@ -369,6 +378,8 @@ class WikiqParser:
|
||||
|
||||
# Iterate through pages
|
||||
for page in dump:
|
||||
incremental_diffs = []
|
||||
previous_text = ""
|
||||
|
||||
# skip namespaces not in the filter
|
||||
if self.namespace_filter is not None:
|
||||
@ -408,8 +419,35 @@ class WikiqParser:
|
||||
regex_matches[k] = []
|
||||
regex_matches[k].append(v)
|
||||
|
||||
if self.compute_incremental_diffs:
|
||||
payload = {
|
||||
'arg1': previous_text,
|
||||
'arg2': rev.text,
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(DIFFS_URL, json=payload)
|
||||
response.raise_for_status()
|
||||
incremental_diffs.append(response.text)
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
print(
|
||||
f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
|
||||
print(e)
|
||||
raise e
|
||||
except requests.exceptions.HTTPError as e:
|
||||
print(f"HTTP Error: {e}")
|
||||
print(f"Response Body: {response.text}")
|
||||
raise e
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
raise e
|
||||
|
||||
previous_text = rev.text
|
||||
|
||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||
row_buffer = table.pop()
|
||||
if self.compute_incremental_diffs:
|
||||
row_buffer['incremental diffs'] = incremental_diffs
|
||||
|
||||
is_revert_column: list[Union[bool, None]] = []
|
||||
for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
|
||||
@ -575,6 +613,10 @@ def main():
|
||||
action='store_true',
|
||||
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||
|
||||
parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs",
|
||||
action='store_true',
|
||||
help="Compute and store incremental diffs by edit session.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# set persistence method
|
||||
@ -626,7 +668,9 @@ def main():
|
||||
regex_revision_label=args.regex_revision_label,
|
||||
regex_match_comment=args.regex_match_comment,
|
||||
regex_comment_label=args.regex_comment_label,
|
||||
output_parquet=output_parquet)
|
||||
output_parquet=output_parquet,
|
||||
compute_incremental_diffs=args.compute_incremental_diffs,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user