1
0
pywikidiff2/pywikidiff2/pywikidiff2.cpp
2025-07-22 09:23:44 -07:00

271 lines
10 KiB
C++

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <vector>
#include "structmember.h"
#undef HAVE_CONFIG_H // this disables the php allocator
#include "../mediawiki-php-wikidiff2/src/lib/Wikidiff2.h"
#include "../mediawiki-php-wikidiff2/src/lib/Formatter.h"
#include "../mediawiki-php-wikidiff2/src/lib/TableFormatter.h"
#include "../mediawiki-php-wikidiff2/src/lib/InlineFormatter.h"
#include "../mediawiki-php-wikidiff2/src/lib/InlineJSONFormatter.h"
using wikidiff2::Wikidiff2;
using wikidiff2::TableFormatter;
using wikidiff2::Formatter;
//using wikidiff2::InlineFormatter;
using wikidiff2::InlineJSONFormatter;
// Instance layout of the Python-visible pywikidiff2 type.
// Besides the standard object header, each instance stores the Wikidiff2
// configuration that its diff methods pass to the Wikidiff2 engine.
typedef struct {
PyObject_HEAD
Wikidiff2::Config config;
} pywikidiff2Obj;
// tp_new slot: allocate a pywikidiff2 instance and fill in the default
// configuration values. Non-default values are applied later by tp_init.
//
// Returns a new reference, or NULL (with the allocator's exception set) when
// the allocation fails.
static PyObject *
pywikidiff2_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    pywikidiff2Obj *self = (pywikidiff2Obj *) type->tp_alloc(type, 0);
    if (self == NULL) {
        // tp_alloc failed; writing the defaults below would dereference NULL.
        return NULL;
    }
    // NOTE(review): no constructor is run for `config`; this assumes
    // Wikidiff2::Config is a plain aggregate of scalars -- confirm in Wikidiff2.h.
    self->config.numContextLines = 15;
    self->config.movedLineThreshold = 0.4;
    self->config.changeThreshold = 0.2;
    self->config.maxMovedLines = 100;
    self->config.maxWordLevelDiffComplexity = 40000000;
    self->config.maxSplitSize = 1;
    self->config.initialSplitThreshold = 0.1;
    self->config.finalSplitThreshold = 0.6;
    return (PyObject *) self;
}
// tp_init slot: override the default configuration with any values supplied
// to the constructor. Every argument is optional and may be given positionally
// or by keyword; arguments that are omitted keep their current value.
//
// Returns 0 on success, or -1 with an exception set if argument parsing fails.
static int
pywikidiff2_init(pywikidiff2Obj *self, PyObject *args, PyObject *kwds)
{
    static char *kwlist[] = {"numContextLines", "moved_line_threshold", "change_threshold", "moved_paragraph_detection_cutoff", "max_word_level_diff_complexity", "max_split_size", "initial_split_threshold", "final_split_threshold", NULL};

    // Parse into locals whose C types exactly match the format units ("i" -> int,
    // "d" -> double) instead of writing through pointers to the Config fields.
    // The field types are declared in Wikidiff2.h; if any of them is not exactly
    // the type the format unit stores, writing into it directly would corrupt
    // the value. Seeding the locals with the current values keeps the defaults
    // for arguments that were not passed.
    int num_context_lines = self->config.numContextLines;
    double moved_line_threshold = self->config.movedLineThreshold;
    double change_threshold = self->config.changeThreshold;
    int max_moved_lines = self->config.maxMovedLines;
    int max_word_level_diff_complexity = self->config.maxWordLevelDiffComplexity;
    int max_split_size = self->config.maxSplitSize;
    double initial_split_threshold = self->config.initialSplitThreshold;
    double final_split_threshold = self->config.finalSplitThreshold;

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iddiiidd", kwlist,
                                     &num_context_lines,
                                     &moved_line_threshold,
                                     &change_threshold,
                                     &max_moved_lines,
                                     &max_word_level_diff_complexity,
                                     &max_split_size,
                                     &initial_split_threshold,
                                     &final_split_threshold)){
        // Exception type kept as-is for callers that already catch it.
        PyErr_SetString(PyExc_ImportError,"Error in configuration");
        return -1;
    }

    // Commit only after parsing succeeded so a failed call leaves config untouched.
    self->config.numContextLines = num_context_lines;
    self->config.movedLineThreshold = moved_line_threshold;
    self->config.changeThreshold = change_threshold;
    self->config.maxMovedLines = max_moved_lines;
    self->config.maxWordLevelDiffComplexity = max_word_level_diff_complexity;
    self->config.maxSplitSize = max_split_size;
    self->config.initialSplitThreshold = initial_split_threshold;
    self->config.finalSplitThreshold = final_split_threshold;
    return 0;
}
// Python method num_context_lines(): return the configured numContextLines
// value as a Python int.
static PyObject *pywikidiff2_numContextLines(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    long context_lines = self->config.numContextLines;
    return PyLong_FromLong(context_lines);
}
// Python method moved_line_threshold(): return the configured
// movedLineThreshold value as a Python float.
static PyObject *pywikidiff2_movedLineThreshold(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    double threshold = self->config.movedLineThreshold;
    return PyFloat_FromDouble(threshold);
}
// Python method change_threshold(): return the configured changeThreshold
// value as a Python float.
static PyObject *pywikidiff2_changeThreshold(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    double threshold = self->config.changeThreshold;
    return PyFloat_FromDouble(threshold);
}
// Python methods max_moved_lines() and moved_paragraph_detection_cutoff():
// return the configured maxMovedLines value as a Python int.
static PyObject *pywikidiff2_maxMovedLines(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    long moved_lines_limit = self->config.maxMovedLines;
    return PyLong_FromLong(moved_lines_limit);
}
// Python method max_word_level_diff_complexity(): return the configured
// maxWordLevelDiffComplexity value as a Python int.
static PyObject *pywikidiff2_maxWordLevelDiffComplexity(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    // The value is converted to a C long before being boxed, as PyLong_FromLong requires.
    long complexity_limit = self->config.maxWordLevelDiffComplexity;
    return PyLong_FromLong(complexity_limit);
}
// Python method max_split_size(): return the configured maxSplitSize value
// as a Python int.
static PyObject *pywikidiff2_maxSplitSize(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    long split_size = self->config.maxSplitSize;
    return PyLong_FromLong(split_size);
}
// Python method initial_split_threshold(): return the configured
// initialSplitThreshold value as a Python float.
static PyObject *pywikidiff2_initialSplitThreshold(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    double threshold = self->config.initialSplitThreshold;
    return PyFloat_FromDouble(threshold);
}
// Python method final_split_threshold(): return the configured
// finalSplitThreshold value as a Python float.
static PyObject *pywikidiff2_finalSplitThreshold(pywikidiff2Obj *self, PyObject *Py_UNUSED(ignored)){
    double threshold = self->config.finalSplitThreshold;
    return PyFloat_FromDouble(threshold);
}
// Copy a NUL-terminated C string into a Wikidiff2::String.
// cstr must not be NULL: its length is measured with strlen.
static Wikidiff2::String char_to_string(char* cstr){
    const size_t length = strlen(cstr);
    return Wikidiff2::String(cstr, length);
}
// Run one Wikidiff2 diff from text1 to text2 using the instance's current
// configuration, and return whatever the inline JSON formatter produced.
// Both inputs are NUL-terminated C strings; they are only read.
static Wikidiff2::String wikidiff2_inline_json_diff(pywikidiff2Obj *self, char* text1, char* text2){
    Wikidiff2::String old_text(text1, strlen(text1));
    Wikidiff2::String new_text(text2, strlen(text2));

    // The engine is declared before the formatter it is handed, so the
    // formatter is destroyed first when this function returns.
    Wikidiff2 engine(self->config);
    InlineJSONFormatter json_formatter;
    engine.addFormatter(json_formatter);
    engine.execute(old_text, new_text);

    return json_formatter.getResult().str();
}
// Python method inline_json_diff_sequence(texts, *, numContextLines=0).
//
// Diffs each string in the list `texts` against the one before it; the first
// string is diffed against the empty string. Returns a new list with one
// inline-JSON diff string per input text (an empty list for empty input).
//
// numContextLines: when non-zero, it is stored in the instance configuration
// before diffing and stays in effect for later calls. Zero means "not given".
//
// Raises TypeError if `texts` is not a list or contains a non-str element,
// and ValueError if the arguments cannot be parsed.
static PyObject *pywikidiff2_inline_json_diff_sequence(pywikidiff2Obj *self, PyObject *args, PyObject *kwargs){
    static char* kwdlist[] = {"texts", "numContextLines", NULL};
    PyObject *py_list_obj;
    // Must be an int: the "i" format unit stores exactly one C int.
    int numContextLines = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|$i",
                                     kwdlist,
                                     &py_list_obj,
                                     &numContextLines)){
        PyErr_SetString(PyExc_ValueError, "Error in arguments to inline_json_diff_sequence");
        return NULL;
    }
    if (!PyList_Check(py_list_obj)) {
        PyErr_SetString(PyExc_TypeError, "Expected a list argument.");
        return NULL;
    }
    Py_ssize_t list_size = PyList_Size(py_list_obj);
    if (list_size < 0) {
        return NULL;
    }
    if (numContextLines != 0) {
        self->config.numContextLines = numContextLines;
    }

    // Slot 0 is the empty "previous revision" of the first text; slot i+1 is
    // the UTF-8 form of texts[i].
    std::vector<const char*> input_texts_str((size_t) list_size+1);
    input_texts_str[0] = "";
    for (Py_ssize_t i = 0; i < list_size; ++i) {
        PyObject *py_text = PyList_GetItem(py_list_obj, i); // Borrowed reference
        if (py_text == NULL) {
            return NULL;
        }
        if (!PyUnicode_Check(py_text)) {
            PyErr_Format(PyExc_TypeError, "List element at index %zd is not a string.", i);
            return NULL;
        }
        // The returned buffer is owned by the str object, which the list keeps
        // alive for the duration of this call, so no private copy is needed.
        const char *c_text = PyUnicode_AsUTF8AndSize(py_text, NULL);
        if (c_text == NULL) {
            return NULL;
        }
        input_texts_str[i+1] = c_text;
    }

    PyObject *result_list = PyList_New(list_size);
    if (result_list == NULL) {
        return NULL;
    }
    for (Py_ssize_t i = 1; i <= list_size; ++i) {
        // The diff helper takes non-const pointers but only reads the text.
        Wikidiff2::String diff_str = wikidiff2_inline_json_diff(
            self,
            const_cast<char *>(input_texts_str[i-1]),
            const_cast<char *>(input_texts_str[i]));
        PyObject *py_diff = PyUnicode_FromFormat("%s", diff_str.c_str());
        if (py_diff == NULL) {
            Py_DECREF(result_list);
            return NULL;
        }
        // PyList_SetItem steals the reference to py_diff.
        PyList_SetItem(result_list, i-1, py_diff);
    }
    return result_list;
}
// Python method inline_json_diff(text1, text2, *, numContextLines=0).
//
// Diffs text1 against text2 and returns the inline-JSON result as a str.
//
// numContextLines: when non-zero, it is stored in the instance configuration
// before diffing and stays in effect for later calls. Zero means "not given".
//
// Raises ValueError if the arguments cannot be parsed.
static PyObject *pywikidiff2_inline_json_diff(pywikidiff2Obj *self, PyObject *args, PyObject *kwargs){
    static char* kwdlist[] = {"text1", "text2", "numContextLines", NULL};
    char* text1;
    char* text2;
    // Must be an int: the "i" format unit stores exactly one C int.
    int numContextLines = 0;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ss|$i",
                                     kwdlist,
                                     &text1,
                                     &text2,
                                     &numContextLines)){
        PyErr_SetString(PyExc_ValueError, "Error in arguments to inline_json_diff");
        return NULL;
    }
    if (numContextLines != 0) {
        self->config.numContextLines = numContextLines;
    }
    Wikidiff2::String ret = wikidiff2_inline_json_diff(self, text1, text2);
    return PyUnicode_FromFormat("%s",ret.c_str());
}
// Method table for the pywikidiff2 type.
// Each entry is {Python-visible name, C implementation, calling convention, docstring}.
static PyMethodDef pywikidiff2_methods[] = {
// Diff entry points: accept positional and keyword arguments.
{"inline_json_diff", (PyCFunction) pywikidiff2_inline_json_diff, METH_VARARGS | METH_KEYWORDS, "run wikidiff 2 on text1 and text2"},
{"inline_json_diff_sequence", (PyCFunction) pywikidiff2_inline_json_diff_sequence, METH_VARARGS | METH_KEYWORDS, "run wikidiff 2 on a series of texts"},
// Read-only accessors for the instance configuration: take no arguments.
{"num_context_lines", (PyCFunction) pywikidiff2_numContextLines, METH_NOARGS, "number of equal lines to output in the context of a diff"},
{"moved_line_threshold", (PyCFunction) pywikidiff2_movedLineThreshold, METH_NOARGS, "The minimum similarity a pair of lines must have to be detected as a moved line. If present, this overrides php.ini wikidiff2.moved_line_threshold"},
{"change_threshold", (PyCFunction) pywikidiff2_changeThreshold, METH_NOARGS, "Changed lines with a similarity value below this threshold will be split into a deleted line and added line. This helps matching up moved lines in some cases."},
// The next two names are aliases: both call pywikidiff2_maxMovedLines.
{"moved_paragraph_detection_cutoff", (PyCFunction) pywikidiff2_maxMovedLines, METH_NOARGS, "When the number of added and deleted lines in a table diff is greater than this limit, no attempt to detect moved lines will be made."},
{"max_moved_lines", (PyCFunction) pywikidiff2_maxMovedLines, METH_NOARGS, "When the number of added and deleted lines in a table diff is greater than this limit, no attempt to detect moved lines will be made."},
{"max_word_level_diff_complexity", (PyCFunction) pywikidiff2_maxWordLevelDiffComplexity, METH_NOARGS,
"When comparing two lines for changes within the line, a word-level diff will be done unless the product of the LHS word count and the RHS word count exceeds this limit."},
{"max_split_size", (PyCFunction) pywikidiff2_maxSplitSize, METH_NOARGS, "The maximum number of lines in $text2 which may be considered for a word-level diff against a single line of $text1. Default: 1."},
{"initial_split_threshold", (PyCFunction) pywikidiff2_initialSplitThreshold, METH_NOARGS, "The minimum similarity which must be maintained during a split detection search. The search terminates when the similarity falls below this level. Default: 0.1."},
{"final_split_threshold", (PyCFunction) pywikidiff2_finalSplitThreshold, METH_NOARGS, "The minimum similarity which must be achieved in order to display the comparison between one line and several lines as a split. Default 0.6."},
{NULL} /* Sentinel */
};
// Type object for pywikidiff2.pywikidiff2.
// Instances are pywikidiff2Obj structs; defaults are set in tp_new and
// constructor arguments are applied in tp_init. No tp_dealloc is set here.
static PyTypeObject pywikidiff2Type = {
.ob_base = PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "pywikidiff2.pywikidiff2",
.tp_basicsize = sizeof(pywikidiff2Obj),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT,
.tp_doc = PyDoc_STR("Create an object that can run wikidiff2"),
.tp_methods = pywikidiff2_methods,
// Configuration is exposed through the methods above, not through a member table.
// .tp_members = pywikidiff2_members,
.tp_init = (initproc) pywikidiff2_init,
.tp_new = pywikidiff2_new,
};
// Module definition for the "pywikidiff2" extension module.
// It declares no module-level functions and a per-module state size of 0.
static struct PyModuleDef pywikidiff2 = {
PyModuleDef_HEAD_INIT,
.m_name = "pywikidiff2",
.m_size=0,
};
// Module entry point called by the import machinery.
// Finalizes the pywikidiff2 type, creates the module and publishes the type on
// it under the attribute name "pywikidiff2". Returns the new module, or NULL
// with ImportError set if any step fails.
PyMODINIT_FUNC PyInit_pywikidiff2(void)
{
    if (PyType_Ready(&pywikidiff2Type) < 0){
        PyErr_SetString(PyExc_ImportError,"not pytype ready");
        return NULL;
    }

    PyObject *module = PyModule_Create(&pywikidiff2);
    if (!module){
        PyErr_SetString(PyExc_ImportError, "module null");
        return NULL;
    }

    // PyModule_AddObject takes over this reference only when it succeeds.
    PyObject *type_object = (PyObject *) &pywikidiff2Type;
    Py_INCREF(type_object);
    if (PyModule_AddObject(module, "pywikidiff2", type_object) >= 0) {
        return module;
    }

    // Registration failed: release the type reference and the module.
    Py_DECREF(type_object);
    Py_DECREF(module);
    PyErr_SetString(PyExc_ImportError,"Could not add object");
    return NULL;
}