Add noargs test for ikwiki

This way we can ensure that the parquet code produces equivalent output.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason
2025-05-28 15:04:10 -05:00
parent d413443740
commit 52757a8239
3 changed files with 27806 additions and 7 deletions

13
wikiq
View File

@@ -14,7 +14,7 @@ from io import TextIOWrapper
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from typing import Any, IO, TextIO, Iterable
from typing import Any, IO, TextIO
from mwxml import Dump
@@ -387,7 +387,7 @@ class WikiqParser:
namespaces: list[int] | None = None,
revert_radius: int = 15,
output_parquet: bool = True,
parquet_buffer_size: int=2000):
parquet_buffer_size: int = 2000):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@@ -427,8 +427,8 @@ class WikiqParser:
# make_dataclass is a function that defines a new dataclass type.
# here we extend the type we have already chosen and add the regular expression types
self.revdata_type: type = dc.make_dataclass('RevData_Parser',
fields=regex_fields,
bases=(revdata_type,))
fields=regex_fields,
bases=(revdata_type,))
# we also need to make sure that we have the right pyarrow schema
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
@@ -544,11 +544,12 @@ class WikiqParser:
# Iterate through a page's revisions
for rev in page:
# create a new data object instead of a dictionary.
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
# create a new data object instead of a dictionary.
rev_data = self.revdata_type(revid=rev.id,
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid=page.id,
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
editorid=editorid,
title=page.title,
deleted=rev.deleted.text,
namespace=namespace