diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py
index f1086f4..a26c387 100644
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -107,6 +107,7 @@ class WikiqTestCase(unittest.TestCase):
 
         try:
             tester.call_wikiq()
+            # tester.call_wikiq("--compute-incremental-diffs")
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))
 
diff --git a/wikiq b/wikiq
index da902be..f2cd9cd 100755
--- a/wikiq
+++ b/wikiq
@@ -17,6 +17,7 @@ from hashlib import sha1
 from typing import Any, IO, TextIO, Generator, Union
 
 import mwxml
+import requests
 from mwxml import Dump
 from deltas.tokenizers import wikitext_split
 
@@ -34,6 +35,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pacsv
 
+DIFFS_URL = 'http://localhost:8000'
 
 class PersistMethod:
     none = 0
@@ -214,7 +216,10 @@ class WikiqParser:
                  namespaces: Union[list[int], None] = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
-                 parquet_buffer_size: int = 2000):
+                 parquet_buffer_size: int = 2000,
+                 compute_incremental_diffs: bool = False,
+                 ):
+
         """
         Parameters:
            persist : what persistence method to use. Takes a PersistMethod value
@@ -225,6 +230,7 @@
         self.persist: int = persist
         self.namespaces = []
         self.revert_radius = revert_radius
+        self.compute_incremental_diffs: bool = compute_incremental_diffs
 
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
@@ -360,6 +366,9 @@
             schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True))
             schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
 
+        if self.compute_incremental_diffs:
+            schema = schema.append(pa.field('incremental diffs', pa.string()))
+
         if self.output_parquet:
             writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
         else:
@@ -369,6 +378,8 @@
 
         # Iterate through pages
         for page in dump:
+            incremental_diffs = []
+            previous_text = ""
 
             # skip namespaces not in the filter
             if self.namespace_filter is not None:
@@ -408,8 +419,35 @@
                         regex_matches[k] = []
                     regex_matches[k].append(v)
 
+                if self.compute_incremental_diffs:
+                    payload = {
+                        'arg1': previous_text,
+                        'arg2': rev.text,
+                    }
+
+                    try:
+                        response = requests.post(DIFFS_URL, json=payload)
+                        response.raise_for_status()
+                        incremental_diffs.append(response.text)
+                    except requests.exceptions.ConnectionError as e:
+                        print(
+                            f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
+                        print(e)
+                        raise e
+                    except requests.exceptions.HTTPError as e:
+                        print(f"HTTP Error: {e}")
+                        print(f"Response Body: {response.text}")
+                        raise e
+                    except requests.exceptions.RequestException as e:
+                        print(f"An unexpected error occurred: {e}")
+                        raise e
+
+                previous_text = rev.text
+
             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
             row_buffer = table.pop()
+            if self.compute_incremental_diffs:
+                row_buffer['incremental diffs'] = incremental_diffs
 
             is_revert_column: list[Union[bool, None]] = []
             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
@@ -575,6 +613,10 @@ def main():
                         action='store_true',
                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
 
+    parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs",
+                        action='store_true',
+                        help="Compute and store incremental diffs by edit session.")
+
     args = parser.parse_args()
 
     # set persistence method
@@ -626,7 +668,9 @@ def main():
                             regex_revision_label=args.regex_revision_label,
                             regex_match_comment=args.regex_match_comment,
                             regex_comment_label=args.regex_comment_label,
-                            output_parquet=output_parquet)
+                            output_parquet=output_parquet,
+                            compute_incremental_diffs=args.compute_incremental_diffs,
+                            )
 
     wikiq.process()
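
Note: this patch assumes a diff service already listening at DIFFS_URL ('http://localhost:8000') that accepts a JSON POST with two keys, 'arg1' (the text of the previous revision of the page) and 'arg2' (the text of the current revision), and returns the computed diff as the response body; wikiq stores response.text verbatim in the new 'incremental diffs' column. That service is not part of the patch. The sketch below is a minimal stand-in for local testing only; Flask and difflib.unified_diff are assumptions here, not the actual diff backend the patch was written against.

import difflib

from flask import Flask, request

app = Flask(__name__)


@app.route('/', methods=['POST'])
def compute_diff():
    # wikiq POSTs {'arg1': previous_text, 'arg2': current_text}.
    payload = request.get_json()
    old_lines = (payload.get('arg1') or '').splitlines(keepends=True)
    new_lines = (payload.get('arg2') or '').splitlines(keepends=True)
    # wikiq stores response.text verbatim, so return the diff as plain text.
    return ''.join(difflib.unified_diff(old_lines, new_lines))


if __name__ == '__main__':
    # Match DIFFS_URL = 'http://localhost:8000' from the patch.
    app.run(host='localhost', port=8000)

With a compatible server running, the new flag can be exercised end to end (wikiq <dump> --compute-incremental-diffs), which is also what the commented-out call in Wikiq_Unit_Test.py is staged to test.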