From a719998e0b110d054fa1969c152d4a5d5fe418ae Mon Sep 17 00:00:00 2001 From: Aleksandr Kovalko Date: Mon, 11 May 2026 19:06:27 +0200 Subject: [PATCH] Route calloc through PyObject_Calloc in C tokenizer The C tokenizer's common.h redefines malloc/realloc/free to the PyObject_* family but left calloc pointing at libc. Entity parsing in Tokenizer_really_parse_entity allocates with calloc and later releases the buffer with free (= PyObject_Free), which is undefined behavior: under PYTHONMALLOC=debug it aborts in _PyMem_DebugRawFree, and otherwise it can silently corrupt the heap. Route calloc through PyObject_Calloc alongside the other allocator macros so both calloc sites match the surrounding free. Fixes #352. --- .../parser/ctokenizer/common.h | 1 + tests/test_tokenizer.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/mwparserfromhell/parser/ctokenizer/common.h b/src/mwparserfromhell/parser/ctokenizer/common.h index dbd13b68..e04472b9 100644 --- a/src/mwparserfromhell/parser/ctokenizer/common.h +++ b/src/mwparserfromhell/parser/ctokenizer/common.h @@ -39,6 +39,7 @@ SOFTWARE. 
#endif #define malloc PyObject_Malloc // XXX: yuck +#define calloc PyObject_Calloc #define realloc PyObject_Realloc #define free PyObject_Free diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 5864b48c..ab38d2e6 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -21,6 +21,9 @@ from __future__ import annotations import os +import subprocess +import sys +import textwrap from collections.abc import Generator from dataclasses import dataclass @@ -145,3 +148,31 @@ def test_describe_context(): assert "" == contexts.describe(0) ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT) assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx + + +@pytest.mark.skipif(CTokenizer is None, reason="CTokenizer not available") +def test_entity_does_not_corrupt_heap(): + """Regression test: the C tokenizer must not mix raw libc calloc with + PyObject_Free when handling an ampersand, whether or not it starts a valid entity. + + Run in a subprocess with PYTHONMALLOC=debug so any allocator mismatch on + the entity-parsing path is reported as a fatal error rather than silent + heap corruption. + """ + program = textwrap.dedent( + """ + import mwparserfromhell + from mwparserfromhell.parser._tokenizer import CTokenizer + assert isinstance(mwparserfromhell.parser.Parser()._tokenizer, CTokenizer) + for text in ("a & b", "{{T|p=a & b}}", "&", "&amp;", "&#42;", "&#x2a;"): + assert str(mwparserfromhell.parse(text)) == text + """ + ) + env = {**os.environ, "PYTHONMALLOC": "debug"} + result = subprocess.run( + [sys.executable, "-c", program], env=env, capture_output=True, text=True + ) + assert result.returncode == 0, ( + "C tokenizer triggered allocator mismatch under PYTHONMALLOC=debug:\n" + f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}" + )