diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f07d4ac5..1b146e162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ set(_tvm_ffi_objs_sources "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/dtype.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/container.cc" "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/init_once.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/src/ffi/custom_allocator.cc" ) set(_tvm_ffi_extra_objs_sources @@ -272,20 +273,11 @@ if (TVM_FFI_BUILD_PYTHON_MODULE) VERBATIM ) - if (Python_VERSION VERSION_GREATER_EQUAL "3.12" AND NOT PYTHON_IS_FREE_THREADED) - # >= Python3.12, use Use_SABI version - python_add_library(tvm_ffi_cython MODULE "${_core_cpp}" USE_SABI 3.12) - target_link_libraries(tvm_ffi_cython PRIVATE Python::SABIModule) - set_target_properties(tvm_ffi_cython PROPERTIES OUTPUT_NAME "core") - if (NOT WIN32) - target_link_libraries(tvm_ffi_cython PRIVATE Python::Module) - set_target_properties(tvm_ffi_cython PROPERTIES SUFFIX ".abi3.so") - endif () - else () - # before Python3.12, use WITH_SOABI version - python_add_library(tvm_ffi_cython MODULE "${_core_cpp}" WITH_SOABI) - set_target_properties(tvm_ffi_cython PROPERTIES OUTPUT_NAME "core") - endif () + # The PyObject-tying impl in tvm_ffi_python_helpers.h uses full Python C API (Py_IncRef, + # PyObject_GC_Del, atomic header reads), so we build against the per-version ABI rather than the + # limited (abi3) ABI. + python_add_library(tvm_ffi_cython MODULE "${_core_cpp}" WITH_SOABI) + set_target_properties(tvm_ffi_cython PROPERTIES OUTPUT_NAME "core") target_include_directories( tvm_ffi_cython PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm_ffi/cython ) diff --git a/include/tvm/ffi/c_api.h b/include/tvm/ffi/c_api.h index 52b6337e3..c1d53ba26 100644 --- a/include/tvm/ffi/c_api.h +++ b/include/tvm/ffi/c_api.h @@ -580,6 +580,66 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj); TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index, void (*deleter)(void* handle), TVMFFIObjectHandle* out); +//----------------------------------------------------------------------- +// Section: ObjectAllocHeader and CustomAllocator +//----------------------------------------------------------------------- +/*! + * \brief Mandatory header placed immediately before each TVMFFIObject body. + * + * This header may be used by TVMFFIObject::deleter to reclaim space when a + * custom allocator is present. It can also be set to NULL if + * TVMFFIObject::deleter directly calls system free. This section must be + * available for each Object so a frontend can rely on this field to confirm + * if the object came from a certain allocator. + */ +typedef struct { + /*! + * \brief Free the allocation. + * \param ptr The pointer to the space of the object. + * \note ``ptr`` points to the space of TVMFFIObject and does not include + * the TVMFFIObjectAllocHeader. + */ + void (*delete_space)(void* ptr); +} TVMFFIObjectAllocHeader; + +/*! + * \brief Custom allocator entry registered with TVMFFISetCustomAllocator. + */ +typedef struct { + /*! + * \brief Allocate the space for an Object body. + * \param size The size requested for the object body. + * \param alignment The alignment requirement for the object body. + * \param type_index Type index of the object. + * \param context The ``context`` field of the registered allocator. + * \return Pointer to the space of the object, or NULL on failure (with + * the error reported via ``TVMFFIErrorSetRaised``). + * \note The returned pointer must be preceded by a + * ``TVMFFIObjectAllocHeader`` whose ``delete_space`` releases the + * full underlying allocation when invoked. + */ + void* (*allocate)(size_t size, size_t alignment, int32_t type_index, void* context); + /*! \brief Allocator context passed unmodified to ``allocate``. */ + void* context; +} TVMFFICustomAllocator; + +/*! + * \brief Get the process-wide custom allocator. + * \return The currently registered allocator (never NULL). + * \note ``TVMFFIGetCustomAllocator`` always returns a valid allocator and + * can be overridden by ``TVMFFISetCustomAllocator``. + */ +TVM_FFI_DLL TVMFFICustomAllocator* TVMFFIGetCustomAllocator(void); + +/*! + * \brief Register the process-wide custom allocator. + * \param allocator Pointer to a TVMFFICustomAllocator, or NULL to restore + * the builtin default. + * \return 0 on success, nonzero on failure. + * \note ``allocator`` must be alive throughout the lifetime of the process. + */ +TVM_FFI_DLL int TVMFFISetCustomAllocator(TVMFFICustomAllocator* allocator); + /*! * \brief Convert type key to type index. * \param type_key The key of the type. diff --git a/include/tvm/ffi/memory.h b/include/tvm/ffi/memory.h index b4fabed5e..ca823a549 100644 --- a/include/tvm/ffi/memory.h +++ b/include/tvm/ffi/memory.h @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -79,6 +80,39 @@ TVM_FFI_INLINE void* AlignedAlloc(size_t size) { #endif } +/*! + * \brief Allocate aligned memory with a runtime-known alignment. + * + * Sibling of the templated ``AlignedAlloc`` for callers that + * receive ``align`` as a parameter (e.g. custom-allocator + * implementations dispatching on ``TVMFFICustomAllocator::allocate``'s + * runtime ``alignment`` argument). + * + * \param size The size. + * \param align The alignment, must be a power of 2. + * \return The pointer to the allocated memory. + */ +TVM_FFI_INLINE void* AlignedAllocRuntime(size_t size, size_t align) { +#ifdef _MSC_VER + if (void* ptr = _aligned_malloc(size, align)) { + return ptr; + } + throw std::bad_alloc(); +#else + if (align <= alignof(std::max_align_t)) { + if (void* ptr = std::malloc(size)) { + return ptr; + } + throw std::bad_alloc(); + } + void* ptr; + if (posix_memalign(&ptr, align, size) != 0) { + throw std::bad_alloc(); + } + return ptr; +#endif +} + /*! * \brief Free aligned memory. * \param data The pointer to the memory to free. @@ -168,7 +202,11 @@ class SimpleObjAllocator : public ObjAllocatorBase { // class with non-virtual destructor. // We are fine here as we captured the right deleter during construction. // This is also the right way to get storage type for an object pool. - void* data = AlignedAlloc(sizeof(T)); + static_assert(alignof(T) <= alignof(::std::max_align_t), + "Object types with alignment > max_align_t are not supported " + "by the custom allocator hook"); + TVMFFICustomAllocator* alloc = TVMFFIGetCustomAllocator(); + void* data = alloc->allocate(sizeof(T), alignof(T), T::RuntimeTypeIndex(), alloc->context); new (data) T(std::forward(args)...); return reinterpret_cast(data); } @@ -187,7 +225,8 @@ class SimpleObjAllocator : public ObjAllocatorBase { tptr->T::~T(); } if (flags & kTVMFFIObjectDeleterFlagBitMaskWeak) { - AlignedFree(static_cast(tptr)); + ObjectUnsafe::GetObjectAllocHeaderFromPtr(static_cast(tptr)) + ->delete_space(static_cast(tptr)); } } }; @@ -215,12 +254,17 @@ class SimpleObjAllocator : public ObjAllocatorBase { static_assert( alignof(ArrayType) % alignof(ElemType) == 0 && sizeof(ArrayType) % alignof(ElemType) == 0, "element alignment constraint"); + static_assert(alignof(ArrayType) <= alignof(::std::max_align_t), + "Object types with alignment > max_align_t are not supported " + "by the custom allocator hook"); size_t size = sizeof(ArrayType) + sizeof(ElemType) * num_elems; // round up to the nearest multiple of align constexpr size_t align = alignof(ArrayType); // C++ standard always guarantees that alignof operator returns a power of 2 size_t aligned_size = (size + (align - 1)) & ~(align - 1); - void* data = AlignedAlloc(aligned_size); + TVMFFICustomAllocator* alloc = TVMFFIGetCustomAllocator(); + void* data = + alloc->allocate(aligned_size, align, ArrayType::RuntimeTypeIndex(), alloc->context); new (data) ArrayType(std::forward(args)...); return reinterpret_cast(data); } @@ -239,7 +283,8 @@ class SimpleObjAllocator : public ObjAllocatorBase { tptr->ArrayType::~ArrayType(); } if (flags & kTVMFFIObjectDeleterFlagBitMaskWeak) { - AlignedFree(static_cast(tptr)); + ObjectUnsafe::GetObjectAllocHeaderFromPtr(static_cast(tptr)) + ->delete_space(static_cast(tptr)); } } }; diff --git a/include/tvm/ffi/object.h b/include/tvm/ffi/object.h index 9b22636ba..c44b4a018 100644 --- a/include/tvm/ffi/object.h +++ b/include/tvm/ffi/object.h @@ -1096,6 +1096,16 @@ struct ObjectUnsafe { return const_cast(&(src->header_)); } + /*! + * \brief Recover the TVMFFIObjectAllocHeader for a TVMFFIObject pointer. + * \param ptr The pointer to the space of the object. + * \return The header set by the allocator that produced ``ptr``. + */ + TVM_FFI_INLINE static TVMFFIObjectAllocHeader* GetObjectAllocHeaderFromPtr(void* ptr) { + return reinterpret_cast(static_cast(ptr) - + sizeof(TVMFFIObjectAllocHeader)); + } + // Suppress -Winvalid-offsetof: we intentionally use offsetof on non-standard-layout types // to avoid undefined behavior from null pointer arithmetic that sanitizers flag. #if defined(__clang__) || defined(__GNUC__) diff --git a/pyproject.toml b/pyproject.toml index 8048be224..a5a515430 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -232,14 +232,19 @@ docstring-code-line-length = 80 [tool.cibuildwheel] build-verbosity = 1 -# only build up to cp312, cp312 -# will be abi3 and can be used in future versions -# ship 314t threaded nogil version -build = ["cp38-*", "cp39-*", "cp310-*", "cp311-*", "cp312-*", "cp314t-*"] +# Per-Python-version wheels (no abi3 / limited API). +build = [ + "cp38-*", + "cp39-*", + "cp310-*", + "cp311-*", + "cp312-*", + "cp313-*", + "cp314t-*", +] skip = ["*musllinux*"] # we only need to test on cp312 test-skip = ["cp38-*", "cp39-*", "cp310-*", "cp311-*"] -# focus on testing abi3 wheel build-frontend = "build[uv]" test-command = "pytest {package}/tests/python -vvs" test-groups = ["test"] diff --git a/python/tvm_ffi/cython/base.pxi b/python/tvm_ffi/cython/base.pxi index a2492077e..37608f774 100644 --- a/python/tvm_ffi/cython/base.pxi +++ b/python/tvm_ffi/cython/base.pxi @@ -361,6 +361,17 @@ def _env_get_current_stream(int device_type, int device_id): cdef extern from "tvm_ffi_python_helpers.h": + int TVMFFIPyRegisterDefaultAllocator() noexcept + void TVMFFIPyMarkPythonFinalizing() noexcept + + bint TVMFFIPyTryGetAttachedPyObject(void* chandle, PyObject** out) noexcept + bint TVMFFIPyIsDetached(void* chandle) noexcept + void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) noexcept + void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) noexcept + void TVMFFIPyTpDealloc(void** ptr_to_chandle, PyObject* wrapper) noexcept + void TVMFFIPySetReviveBlock(PyObject* cached_alloc) noexcept + void TVMFFIPyInstallTypeSlots(PyObject* type_obj) noexcept + # no need to expose fields of the call context setter data structure ctypedef struct TVMFFIPyCallContext: int device_type @@ -542,5 +553,11 @@ cdef _init_env_api(): _init_env_api() + +CHECK_CALL(TVMFFIPyRegisterDefaultAllocator()) + +import atexit as _tvm_ffi_atexit +_tvm_ffi_atexit.register(TVMFFIPyMarkPythonFinalizing) + # ensure testing is linked and we can run testcases TVMFFITestingDummyTarget() diff --git a/python/tvm_ffi/cython/function.pxi b/python/tvm_ffi/cython/function.pxi index f613fda52..c7af404a7 100644 --- a/python/tvm_ffi/cython/function.pxi +++ b/python/tvm_ffi/cython/function.pxi @@ -545,9 +545,19 @@ cdef int TVMFFIPyArgSetterObjectRValueRef_( PyObject* py_arg, TVMFFIAny* out ) except -1: """Setter for ObjectRValueRef""" - cdef object arg = py_arg + cdef CObject src = (py_arg).obj + # need to detach from chandle + # there are two possible outcomes after the call: + # chandle gets moved, so it is set to NULL + # callee did not move chandle, in such case src.chandle is valid + # but chandle is no longer attached to PyObject + # we need to carefully handle chandle and PyObject recycling in both cases. + # These logics are implemented in TVMFFIPyTpDealloc (CObject.__dealloc__). + # NOTE: TVMFFIPyDetachPyObject is robust to cases where the Object is not + # allocated by the custom Python allocator. + TVMFFIPyDetachPyObject(src.chandle, src) out.type_index = kTVMFFIObjectRValueRef - out.v_ptr = &(((arg.obj)).chandle) + out.v_ptr = &(src.chandle) return 0 @@ -1089,6 +1099,7 @@ def _register_global_func(name: str, pyfunc: Callable[..., Any] | Function, over def _get_global_func(name: str, allow_missing: bool): + # PyObject tying is not applicable here. cdef TVMFFIObjectHandle chandle cdef ByteArrayArg name_arg = ByteArrayArg(c_str(name)) diff --git a/python/tvm_ffi/cython/object.pxi b/python/tvm_ffi/cython/object.pxi index c564b42f6..fe09bac5c 100644 --- a/python/tvm_ffi/cython/object.pxi +++ b/python/tvm_ffi/cython/object.pxi @@ -103,9 +103,8 @@ cdef class CObject: self.chandle = NULL def __dealloc__(self): - if self.chandle != NULL: - CHECK_CALL(TVMFFIObjectDecRef(self.chandle)) - self.chandle = NULL + # Release the wrapper's +1 on the chandle and inactivate-or-free its allocation. + TVMFFIPyTpDealloc(&self.chandle, self) def __ctypes_handle__(self) -> object: return ctypes_handle(self.chandle) @@ -173,8 +172,12 @@ cdef class CObject: return isinstance(other, CObject) and self.chandle == (other).chandle def __move_handle_from__(self, other: CObject) -> None: - self.chandle = (other).chandle + cdef void* chandle = (other).chandle + TVMFFIPyDetachPyObject(chandle, other) + self.chandle = chandle (other).chandle = NULL + if TVMFFIPyIsDetached(chandle): + TVMFFIPyAttachPyObject(chandle, self) def __init_handle_by_constructor__(self, fconstructor: Any, *args: Any) -> None: # avoid error raised during construction. @@ -183,6 +186,8 @@ cdef class CObject: ConstructorCall( (fconstructor).chandle, args, &chandle, NULL) self.chandle = chandle + if TVMFFIPyIsDetached(chandle): + TVMFFIPyAttachPyObject(chandle, self) cdef class CContainerBase(CObject): @@ -432,6 +437,19 @@ cdef inline object make_ret_opaque_object(TVMFFIAny result): (obj).chandle = result.v_obj return obj.pyobject() +cdef public void** TVMFFICyObjectGetCHandlePtr(PyObject* ptr) noexcept: + # Address of a CObject wrapper's ``chandle`` field, for the C++ helpers. + return &((ptr).chandle) + + +cdef inline void _install_pyobject_tying_slots(object type_cls): + """Install the custom tp_alloc / tp_free PyObject-tying slots on a + registered FFI type. No-op unless ``type_cls`` is a ``CObject`` subclass. + """ + if type_cls is not None and isinstance(type_cls, type) and issubclass(type_cls, CObject): + TVMFFIPyInstallTypeSlots(type_cls) + + cdef inline object make_fallback_cls_for_type_index(int32_t type_index): cdef str type_key = _type_index_to_key(type_index) cdef object type_info = _lookup_or_register_type_info_from_type_key(type_key) @@ -469,22 +487,44 @@ cdef inline object make_fallback_cls_for_type_index(int32_t type_index): cdef inline object make_ret_object(TVMFFIAny result): - cdef int32_t type_index - cdef object cls, obj - type_index = result.type_index + """Wrap a returned chandle into its canonical Python wrapper. + + Caller must own +1 on ``result.v_obj``; ownership transfers to the + returned wrapper. This is the cache-&-revive dispatcher. + """ + cdef int32_t type_index = result.type_index + cdef object cls, obj, cached + cdef PyObject* attached_ptr if type_index < len(TYPE_INDEX_TO_CLS) and (cls := TYPE_INDEX_TO_CLS[type_index]) is not None: if issubclass(cls, PyNativeObject): + # Value-typed: the transient Object wrapper is discarded after + # __from_tvm_ffi_object__. No identity stability needed. obj = Object.__new__(Object) (obj).chandle = result.v_obj return cls.__from_tvm_ffi_object__(cls, obj) + # ``attached_ptr`` is the usable (untagged) wrapper; the bool says Active. + if TVMFFIPyTryGetAttachedPyObject(result.v_obj, &attached_ptr): + # LIVE HIT: cached wrapper is alive and already owns a +1; + # drop the caller's redundant +1 and return it. + cached = attached_ptr + CHECK_CALL(TVMFFIObjectDecRef(result.v_obj)) + return cached + if attached_ptr != NULL: + # REVIVAL: revive the inactive cached allocation in place so ``id()`` stays + # stable; tp_alloc (via cls.__new__) reinitializes it there. + TVMFFIPySetReviveBlock(attached_ptr) + obj = cls.__new__(cls) + # Defensive: clear the slot in case cls.__new__ bypassed tp_alloc. + TVMFFIPySetReviveBlock(NULL) + (obj).chandle = result.v_obj + TVMFFIPyAttachPyObject(result.v_obj, obj) + return obj else: - # Slow path: object is not found in registered entry - # In this case create a dummy stub class for future usage. - # For every unregistered class, this slow path will be triggered only once. cls = make_fallback_cls_for_type_index(type_index) obj = cls.__new__(cls) (obj).chandle = result.v_obj + TVMFFIPyAttachPyObject(result.v_obj, obj) return obj @@ -611,6 +651,12 @@ cdef _update_registry(int type_index, object type_key, object type_info, object TYPE_KEY_TO_INFO[type_key] = type_info if type_cls is not None: TYPE_CLS_TO_INFO[type_cls] = type_info + # Universal choke point for every registered FFI type (core + # _register_object_by_index, dynamic _register_py_class, and + # make_fallback_cls_for_type_index all funnel here): install the custom + # tp_alloc / tp_free that drive PyObject-tying. tp_alloc / tp_free are + # not inherited by dynamic subtypes, so each type must be patched here. + _install_pyobject_tying_slots(type_cls) def _register_object_by_index(int type_index, object type_cls): diff --git a/python/tvm_ffi/cython/pycallback.pxi b/python/tvm_ffi/cython/pycallback.pxi index 59dfdb6fc..27fd90686 100644 --- a/python/tvm_ffi/cython/pycallback.pxi +++ b/python/tvm_ffi/cython/pycallback.pxi @@ -74,7 +74,13 @@ cdef int TVMFFIPyCallbackArgSetterObject_( const TVMFFIAny* arg, PyObject** out ) except -1: - """Callback arg setter for generic static object types (type_index >= kTVMFFIStaticObjectBegin).""" + """Callback arg setter for generic static object types (type_index >= kTVMFFIStaticObjectBegin). + + Funnels through ``make_ret_object`` so the callback receives the + canonical wrapper for the chandle. When the caller already has a + wrapper for this chandle, the callback's arg is the same Python + object — universal cache-on aliasing. + """ cdef TVMFFIObjectHandle chandle = arg.v_ptr TVMFFIObjectIncRef(chandle) try: @@ -228,6 +234,10 @@ cdef int TVMFFIPyCallbackArgSetterRValueRef_( if actual_type_index == kTVMFFITensor: obj = make_tensor_from_chandle(chandle, api) else: + # The rvalue setter on the caller side already eager-detached + # the source wrapper's binding via TVMFFIPyDetachPyObject, so + # this chandle is Init — make_ret_object allocates a fresh + # canonical wrapper and Attaches it. obj = make_ret_object(synthesized) if api != NULL and isinstance(obj, CContainerBase): (obj)._dlpack_exchange_api = api diff --git a/python/tvm_ffi/cython/tensor.pxi b/python/tvm_ffi/cython/tensor.pxi index dcf8e19c5..d7c09c453 100644 --- a/python/tvm_ffi/cython/tensor.pxi +++ b/python/tvm_ffi/cython/tensor.pxi @@ -497,7 +497,11 @@ cdef inline object make_tensor_from_chandle( # call the deleter to free the memory since we will continue to use the chandle dlpack.deleter(dlpack) pass - # default return the tensor + # default return the tensor. + # NOTE: we don't TVMFFIPyAttachPyObject here — the same chandle may be + # wrapped multiple times via this factory (e.g. once per arg setter + # callback) and attach-then-overwrite would corrupt the tying cache. + # Tensors returned via the FFI go through make_ret_object instead. tensor = _CLASS_TENSOR.__new__(_CLASS_TENSOR) (tensor).chandle = chandle (tensor).cdltensor = TVMFFITensorGetDLTensorPtr(chandle) diff --git a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h index c82b09ce4..dd466f360 100644 --- a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h +++ b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h @@ -26,6 +26,7 @@ #include #include #include +#include // Define here to avoid dependencies on non-c headers for now #ifndef TVM_FFI_INLINE @@ -59,11 +60,22 @@ #endif #endif +// Managed-dict (`__slots__ = ("__dict__",)` without an explicit dictoffset) +// is a CPython 3.11+ feature. On 3.9/3.10 such types instead use a regular +// ``tp_dictoffset != 0``, which the inactive-eligibility check catches anyway, +// so defining the flag as 0 here yields the correct (no-op) behavior. +#ifndef Py_TPFLAGS_MANAGED_DICT +#define Py_TPFLAGS_MANAGED_DICT 0 +#endif + +#include +#include #include #include #include #include #include +#include #include ///-------------------------------------------------------------------------------- @@ -71,6 +83,577 @@ // prefixed with TVMFFIPy so they can be easily invoked through Cython. ///-------------------------------------------------------------------------------- +//================================================================================ +// PyObject-tying state machine. +// +// Ties one Python wrapper to one C++ chandle so that +// - ``a.x is a.x`` while the wrapper is live; +// - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep +// the chandle alive); +// - ``f(x) is x`` whenever an FFI function returns a chandle that already +// has a canonical wrapper. +// +// Layout +// ------ +// Every Object allocated through the registered Python allocator +// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``: +// +// malloc start +// +-------------------+--------------------------+--------+ +// | tagged_pyobj | TVMFFIObjectAllocHeader | T | +// | (offset 0..8) | delete_space (8..16) | | +// +-------------------+--------------------------+--------+ +// ^ ptr = malloc + 16 +// +// ``tagged_pyobj`` is a tagged pointer to the canonical Python wrapper. The +// wrapper is >= 16-aligned, so its low bit is free to encode the state (see +// below) without growing the header past its fixed 16 bytes. +// +// States +// ------ +// Detached: ``tagged_pyobj == NULL`` -- no wrapper bound to this chandle. +// Active: ``(tagged_pyobj & 1) == 0`` -- the live canonical wrapper. +// Inactive: ``(tagged_pyobj & 1) == 1`` -- dead, untracked allocation cached +// for address-stable revival. +// +// Invariants +// ---------- +// I1. When a PyObject goes out of scope (no Python var refers to it), its +// +1 on chandle is always released (in ``__dealloc__`` -> +// ``TVMFFIPyTpDealloc``). +// I2. When a chandle is destroyed, its cached allocation (if any) is +// reclaimed. +// I3. A PyObject whose ``chandle != NULL && != DEALLOC_TRANSIT`` owns +// +1 on that chandle, so any reader that follows such a ``chandle`` +// observes a live object. ``DEALLOC_TRANSIT`` is set only transiently +// while a wrapper deallocs -- armed in ``TVMFFIPyTpDealloc``, cleared +// before the dealloc completes -- so no reader outside that window ever +// observes one. +// I4. Every ``PyObject*`` the Cython side passes to a helper here is a +// live wrapper (tag bit 0), never a value read back from +// ``tagged_pyobj``. Only this header ever sets or clears the tag. +// +// The dealloc handshake +// --------------------- +// When an eligible wrapper deallocs, its allocation should be *cached Inactive* +// (kept for an address-stable revival) if the chandle outlives it, or +// *genuinely freed* if the wrapper held the chandle's last reference. Reading +// the chandle's refcount to decide would race a concurrent C++-side ``DecRef`` +// (FFI calls release the GIL), so ``TVMFFIPyTpDealloc`` does not read it. +// +// Instead it arms a ``DEALLOC_TRANSIT`` marker on ``wrapper.chandle`` and +// ``DecRef``s unconditionally; whether the chandle's deleter fires during that +// ``DecRef`` decides the outcome. The marker lets the three dealloc callbacks +// coordinate: +// +// Case 0 (chandle outlives us): the deleter does not fire; ``tp_free`` sees +// the marker and caches the allocation Inactive. +// Case 1 (wrapper held the last ref): the deleter (``TVMFFIPyDeleteSpace``) +// fires inside the ``DecRef`` and hands the genuine free back to ``tp_free``. +// Case 2 (cached allocation's chandle dies later): the deleter reclaims the +// allocation directly (``PyObject_GC_Del``). +// +// Where transitions happen +// ------------------------ +// ``make_ret_object`` (object.pxi) -- the dispatcher; the only frame that +// holds the chandle->wrapper mapping: +// Detached -> Active : fresh wrapper; take the caller's +1, then attach. +// Active -> Active : return the cached wrapper as-is; drop the +// caller's redundant +1. +// Inactive -> Active : revive the cached allocation in place (stable +// ``id()``); take the caller's +1, re-attach. +// +// ``TVMFFIPyTpDealloc`` (CObject.__dealloc__) -- runs when the wrapper's +// refcount hits 0, before the free: +// Active -> Inactive : type is inactive-eligible; pre-tag Inactive, arm +// the DEALLOC_TRANSIT marker, DecRef. ``tp_free`` / +// ``TVMFFIPyDeleteSpace`` finish the handshake (the +// allocation is cached iff the chandle survived the DecRef). +// Active -> Detached : type not eligible; detach first, then DecRef. +// +// ``TVMFFIPyArgSetterObjectRValueRef_`` (function.pxi), +// ``__move_handle_from__`` (object.pxi): +// Active -> Detached : detach the binding before a move nulls the +// source chandle. +// +// ``TVMFFIPyDeleteSpace`` (Weak deleter) -- the chandle's refcount hit 0: +// Inactive -> (gone) : reclaim the cached allocation (``PyObject_GC_Del``) +// and free the chandle's block -- unless our own +// dealloc drove this free, when the DEALLOC_TRANSIT +// marker defers the allocation free to ``tp_free``. +// +// Slot install +// ------------ +// ``tp_alloc`` / ``tp_free`` are NOT inherited by dynamic subtypes (CPython +// resets them per dynamic subtype), so each registered type needs its own +// install. ``_update_registry`` (object.pxi) -- the choke point every +// registered FFI type funnels through -- calls ``TVMFFIPyInstallTypeSlots`` +// there, once per type. +// +// Shutdown guard +// -------------- +// ``TVMFFIPyMarkPythonFinalizing`` is wired to atexit from Cython module +// init. After it fires, inactive cached allocations on still-live chandles are +// intentionally leaked (process exiting; OS reclaims) rather than reaching +// for ``PyGILState_Ensure`` on a teardown interpreter. +// +// Free-threaded builds (``Py_GIL_DISABLED``) +// ------------------------------------------ +// The state machine above is GIL-serialized: it reads and mutates +// ``tagged_pyobj`` with no synchronization, so without the GIL the Active-hit +// borrowed read races into a use-after-free. The feature is therefore disabled +// wholesale on free-threaded builds at its two one-time setup gates +// (``TVMFFIPyRegisterDefaultAllocator`` / ``TVMFFIPyInstallTypeSlots``, both +// ``#ifdef``'d below). With neither the custom allocator nor the custom slots +// installed, every chandle is non-canonical, so all helpers take their existing +// no-op tail: ``make_ret_object`` returns a fresh wrapper and ``__dealloc__`` +// just releases its +1 -- the FT-safe behavior from before this feature, minus +// identity stickiness (``a.x is a.x`` / stable ``id()``). The GIL build keeps +// the gates active and is unchanged. +//================================================================================ + +/*! + * \brief Python-side derived header. ``base.delete_space`` sits at + * ``ptr - sizeof(TVMFFIObjectAllocHeader)`` so the generic C++ + * deleter (which knows nothing about Python) can find it. + */ +struct PyCustomAllocHeader { + PyObject* tagged_pyobj; + TVMFFIObjectAllocHeader base; +}; + +static_assert(sizeof(PyCustomAllocHeader) == 16, + "header must be 16 bytes so T at ptr = malloc + 16 is naturally " + "aligned for alignof(T) up to alignof(max_align_t)"); +static_assert(offsetof(PyCustomAllocHeader, base) == + sizeof(PyCustomAllocHeader) - sizeof(TVMFFIObjectAllocHeader), + "base must sit at ptr - sizeof(TVMFFIObjectAllocHeader) for the " + "C++ deleter to find it"); + +TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* ptr) { + return reinterpret_cast(static_cast(ptr) - + sizeof(PyCustomAllocHeader)); +} + +constexpr uintptr_t kPyCachedInactiveTagBit = 1; + +TVM_FFI_INLINE bool TVMFFIPyTagIsInactive(PyObject* tagged) { + return (reinterpret_cast(tagged) & kPyCachedInactiveTagBit) != 0; +} +TVM_FFI_INLINE PyObject* TVMFFIPyRemoveTag(PyObject* tagged) { + return reinterpret_cast(reinterpret_cast(tagged) & + ~kPyCachedInactiveTagBit); +} +// The two shared queries above (Inactive predicate, tag removal) stay helpers. +// There is no Active tagger -- a live wrapper pointer already has tag bit 0 -- +// and the sole tag-set (Active -> Inactive) has one call site, inlined in +// TVMFFIPyTpDealloc. + +/*! + * \brief Per-thread vehicle carrying the inactive cached allocation address from + * ``make_ret_object`` (which knows the chandle) down into + * ``TVMFFIPyTpAlloc`` (which is handed only ``type`` and an item count). + * + * The slot's sole access primitive is a swap: store ``next``, return the prior + * value. Taking the block is thus ``TVMFFIPyTLSReviveSlot(nullptr)`` -- + * read-and-clear in one step, with no separate clear to forget. Per-thread -> + * free-threading safe. + */ +inline PyObject* TVMFFIPyTLSReviveSlot(PyObject* next) { + static thread_local PyObject* slot = nullptr; + std::swap(slot, next); + return next; +} + +/*! \brief Arm the cached allocation to be reused by the next ``tp_alloc`` on + * this thread. Called by ``make_ret_object`` immediately before + * ``cls.__new__``. */ +extern "C" TVM_FFI_INLINE void TVMFFIPySetReviveBlock(PyObject* cached_alloc) { + TVMFFIPyTLSReviveSlot(cached_alloc); +} + +// Forward decl; defined below. +// +// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to +// [[gnu::always_inline]] which forbids taking the function's address as +// a stable, callable pointer — and we hand the address to the C++ side +// (stored in PyCustomAllocHeader::base.delete_space at allocate time). +inline void TVMFFIPyDeleteSpace(void* ptr); + +// Atexit-driven shutdown guard. ``TVMFFIPyMarkPythonFinalizing`` flips +// the flag to false from an atexit hook registered in Cython module init; +// ``TVMFFIPyDeleteSpace`` reads it via ``TVMFFIPyIsPythonAlive`` before +// ``PyGILState_Ensure`` to avoid touching a teardown interpreter. +inline std::atomic& TVMFFIPyAliveFlagStorage() { + static std::atomic flag{true}; + return flag; +} + +extern "C" inline bool TVMFFIPyIsPythonAlive() noexcept { + return TVMFFIPyAliveFlagStorage().load(std::memory_order_acquire); +} + +extern "C" inline void TVMFFIPyMarkPythonFinalizing() noexcept { + TVMFFIPyAliveFlagStorage().store(false, std::memory_order_release); +} + +/*! + * \brief True iff ``chandle`` was allocated through the Python custom + * allocator (full ``PyCustomAllocHeader`` ahead of it). False for + * allocations that came through libtvm_ffi's builtin default + * (only the base ``TVMFFIObjectAllocHeader``). + * + * Detection is by comparing ``base.delete_space`` against + * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter + * pointer, so multiple frontends can coexist without a flag bit on + * ``TVMFFIObject``. + */ +TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) { + if (chandle == nullptr) return false; + TVMFFIObjectAllocHeader* base = reinterpret_cast( + static_cast(chandle) - sizeof(TVMFFIObjectAllocHeader)); + return base->delete_space == &TVMFFIPyDeleteSpace; +} + +//--------------------------------------------------------------- +// State-machine helpers. +//--------------------------------------------------------------- + +extern "C" { + +/*! + * \brief Query the wrapper bound to ``chandle``. ``*out`` receives the + * *usable* (untagged) wrapper pointer; the return value says whether + * that wrapper is the live Active one. + * + * The (return, ``*out``) pair has four combinations; one cannot occur: + * (true, non-NULL) : Active -- the live canonical wrapper. + * (false, NULL) : Detached / non-Python -- no wrapper bound. + * (false, non-NULL) : Inactive -- revivable cached allocation. + * (true, NULL) : impossible -- a true (Active) return always has a + * real wrapper, so ``*out`` is non-NULL on every hit. + * + * \param chandle The chandle to query. + * \param out Receives the usable (untagged) wrapper pointer, or NULL. + * \return Whether the returned PyObject state is Active (the live canonical + * wrapper). False for Detached, Inactive, or non-Python chandles. + */ +TVM_FFI_INLINE bool TVMFFIPyTryGetAttachedPyObject(void* chandle, PyObject** out) { + if (!TVMFFIPyIsCanonical(chandle)) { + *out = nullptr; + return false; + } + PyObject* tagged = TVMFFIPyHeader(chandle)->tagged_pyobj; + *out = TVMFFIPyRemoveTag(tagged); + return tagged != nullptr && !TVMFFIPyTagIsInactive(tagged); +} + +/*! + * \brief True iff ``chandle`` is a canonical Python-allocated chandle with no + * wrapper currently bound (the Detached state). False for Active, + * Inactive, or non-Python chandles. + */ +TVM_FFI_INLINE bool TVMFFIPyIsDetached(void* chandle) { + if (!TVMFFIPyIsCanonical(chandle)) return false; + return TVMFFIPyHeader(chandle)->tagged_pyobj == nullptr; +} + +/*! + * \brief Bind ``obj`` to ``chandle`` as the canonical active PyObject. + * No-op for chandles without a Python alloc header. + */ +TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) { + if (!TVMFFIPyIsCanonical(chandle)) return; + TVMFFIPyHeader(chandle)->tagged_pyobj = obj; +} + +/*! + * \brief Clear the canonical-PyObject binding (Active -> Detached). + * No-op when ``obj`` is not the bound PyObject, or for chandles + * without a Python alloc header. + */ +TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) { + if (!TVMFFIPyIsCanonical(chandle)) return; + PyCustomAllocHeader* h = TVMFFIPyHeader(chandle); + if (h->tagged_pyobj == obj) { + h->tagged_pyobj = nullptr; + } +} + +} // extern "C" + +//--------------------------------------------------------------- +// Custom tp_alloc / tp_dealloc / tp_free and per-type slot installation. +// +// These run the dealloc handshake described in the section header above. They +// are kept in execution order -- alloc, then dealloc, then free -- with the +// DEALLOC_TRANSIT marker that couples dealloc and free declared between them. +//--------------------------------------------------------------- + +// Address of a CObject wrapper's ``chandle`` field, defined in object.pxi. +__PYX_EXTERN_C void** TVMFFICyObjectGetCHandlePtr(PyObject* ptr); + +inline void TVMFFIPyTpFree(void* self); + +/*! + * \brief True iff a wrapper of ``wrapper``'s type may be cached & revived. + * + * Requirements (all must hold, else we genuinely free and lose only + * stable-id-across-drop): + * - GC type: revival re-tracks and genuine free / reclaim use GC_Del. + * - our custom ``tp_free`` is installed: otherwise the generic free would + * reclaim the block while ``tagged_pyobj`` still points at it (UAF). + * - no instance ``__dict__`` (plain or managed): reusing a cached allocation whose dict + * region was cleared would need dict re-init we do not perform. Lean + * wrappers (the common, tested case) have ``tp_dictoffset == 0``. + * - no finalizer (``tp_finalize``, i.e. a Python ``__del__``): CPython sets + * a permanent GC-finalized bit the first time ``tp_finalize`` runs and + * never runs it again on that block. Reusing an inactive cached allocation would silently + * suppress ``__del__`` for every revived generation (the cached allocation's bit is + * already set and there is no public API to clear it). Excluding these + * types makes ``__del__`` fire correctly once per drop (genuine free each + * time); the only cost is no stable-id-across-drop. + */ +TVM_FFI_INLINE bool TVMFFIPyIsInactiveEligible(PyObject* wrapper) { + PyTypeObject* tp = Py_TYPE(wrapper); + if (!PyType_IS_GC(tp)) return false; + if (tp->tp_free != &TVMFFIPyTpFree) return false; + if (tp->tp_dictoffset != 0) return false; + if ((tp->tp_flags & Py_TPFLAGS_MANAGED_DICT) != 0) return false; + if (tp->tp_finalize != nullptr) return false; + return true; +} + +/*! + * \brief Custom ``tp_alloc``. On the revival path (an inactive cached allocation was handed + * to this thread via ``TVMFFIPySetReviveBlock``) it revives the cached allocation + * in place — same address, so ``id()`` is stable — re-initializing it + * to match ``PyType_GenericAlloc``'s contract. Otherwise (miss) it + * forwards to ``PyType_GenericAlloc`` for a fresh, tracked object. + * + * Revive-path contract (must match what ``tp_new`` expects from + * ``PyType_GenericAlloc``): + * 1. zero the body ``[sizeof(PyObject), tp_basicsize)`` so ``__cinit__`` + * sees clean fields; + * 2. ``PyObject_Init`` -> ob_refcnt = 1, ob_type, INCREF(type); + * 3. ``PyObject_GC_Track`` -> GenericAlloc returns a *tracked* object and + * ``tp_new`` does not track again, so the revive path must track. + * (No stale GC-finalized bit to clear: the design uses no tp_finalize.) + * + * Fixed-size only: ``PyObject_Init`` resets ``ob_refcnt``/``ob_type`` but not + * ``ob_size``. Every registered FFI wrapper is a fixed-size cdef class + * (``tp_itemsize == 0``), asserted below; a future variable-sized type would + * need ``PyObject_InitVar(.., nitems)`` here and a matching basicsize check. + */ +inline PyObject* TVMFFIPyTpAlloc(PyTypeObject* type, Py_ssize_t nitems) { + // Take the revive block and leave the slot NULL in one step (per-thread). + PyObject* blk = TVMFFIPyTLSReviveSlot(nullptr); + if (blk != nullptr) { + // REVIVAL: revive the inactive cached allocation at the same address. The body memset + // below assumes ``type->tp_basicsize`` equals the cached allocation's original + // basicsize. This holds because a chandle's ``type_index`` maps to one + // stable wrapper class for the life of the process, and ``make_ret_object`` + // derives both the cached allocation (from the chandle) and ``type`` (= cls for that + // same type_index) from the very same chandle on the revival path. + assert(type->tp_itemsize == 0 && + "cache-&-revive supports only fixed-size wrappers; a variable-sized " + "type needs PyObject_InitVar and a per-instance basicsize check"); + std::memset(reinterpret_cast(blk) + sizeof(PyObject), 0, + static_cast(type->tp_basicsize) - sizeof(PyObject)); + PyObject_Init(blk, type); + PyObject_GC_Track(blk); + return blk; + } + return PyType_GenericAlloc(type, nitems); // MISS: fresh (already tracked) +} + +/*! \brief Marker armed on a wrapper's ``chandle`` field while its + * ``__dealloc__`` handshake is in flight (see "The dealloc handshake" + * above). A non-NULL, non-pointer value distinct from any real chandle + * (which is >= 16-aligned), so callbacks can tell it apart on sight. */ +#define TVM_FFI_PY_DEALLOC_TRANSIT (reinterpret_cast(static_cast(1))) + +extern "C" { + +/*! + * \brief ``__dealloc__`` hook: releases the wrapper's +1 on chandle, opening + * the cache-vs-free handshake (described above) for an eligible + * canonical wrapper. Runs when the wrapper's refcount hits 0, before + * ``tp_free``. + * + * The other two paths skip the handshake entirely: + * - Ineligible wrapper: detach the binding BEFORE the DecRef, so a deleter + * firing inside it sees no stale Active back-pointer, then DecRef. + * - Non-canonical binding (not ours -- cleared by an eager move, points at + * another wrapper or an inactive cached allocation, or a chandle with no + * Python alloc header): just null ``chandle`` and DecRef. + */ +TVM_FFI_INLINE void TVMFFIPyTpDealloc(void** ptr_to_chandle, PyObject* wrapper) { + void* chandle = *ptr_to_chandle; + // chandle already released -> nothing to do: an eager move + // (``__move_handle_from__`` / rvalue arg setter) detached it to NULL. A + // DEALLOC_TRANSIT marker never outlives its own dealloc chain, so it never + // surfaces on a fresh ``__dealloc__`` -- NULL is the only released state here. + if (chandle == nullptr) return; + if (TVMFFIPyIsCanonical(chandle)) { + PyCustomAllocHeader* h = TVMFFIPyHeader(chandle); + // Past the guard above, `chandle` is a real object, so the binding (if + // ours) is the live Active wrapper -- a plain `==` is the canonical check. + if (h->tagged_pyobj == wrapper) { + if (TVMFFIPyIsInactiveEligible(wrapper)) { + // Active -> Inactive (pending): set the tag bit (the only Inactive + // tag-set) and arm the transit marker, THEN DecRef. No count read. + h->tagged_pyobj = reinterpret_cast(reinterpret_cast(wrapper) | + kPyCachedInactiveTagBit); + *ptr_to_chandle = TVM_FFI_PY_DEALLOC_TRANSIT; // tp_free / deleter settle this + TVMFFIObjectDecRef(chandle); + return; + } + // Active -> Detached: not eligible. Detach first so the chandle deleter + // (which may fire inside the DecRef below) sees no stale Active binding. + h->tagged_pyobj = nullptr; + } + } + // Tail (not eligible and non-canonical): genuine free. Null wrapper.chandle + // BEFORE DecRef so the deleter chain observes a consistent (chandle == NULL, + // no +1 owed) state. + *ptr_to_chandle = nullptr; + TVMFFIObjectDecRef(chandle); +} + +} // extern "C" + +/*! + * \brief Custom ``tp_free``, the second step of the dealloc handshake. + * + * - ``chandle == DEALLOC_TRANSIT``: the chandle outlived the DecRef, so keep + * the allocation cached Inactive for a future revival. Settle the marker to + * NULL so a later cross-thread deleter reads a fully Inactive allocation, + * and skip the real free. + * - otherwise (``chandle == NULL``): a genuine free -- the type was never + * eligible, or the deleter already deferred the free here. Mirror CPython's + * default, dispatching on GC-ness so it stays correct for every type we + * install on, GC or not. + */ +inline void TVMFFIPyTpFree(void* self) { + void** chandle_ptr = TVMFFICyObjectGetCHandlePtr(static_cast(self)); + if (*chandle_ptr == TVM_FFI_PY_DEALLOC_TRANSIT) { + // Keep cached Inactive; settle the marker so the allocation reads as such. + *chandle_ptr = nullptr; + return; + } + PyObject* op = static_cast(self); + if (PyObject_IS_GC(op)) { + PyObject_GC_Del(op); + } else { + PyObject_Free(op); + } +} + +/*! + * \brief Install ``TVMFFIPyTpAlloc`` / ``TVMFFIPyTpFree`` on a registered + * type. These slots are NOT inherited by dynamically created + * subtypes (CPython resets them per dynamic subtype), so every + * registered type must be patched at its Cython registration choke + * point. Idempotent. No-op when ``type_obj`` is not a type object. + */ +extern "C" TVM_FFI_INLINE void TVMFFIPyInstallTypeSlots(PyObject* type_obj) { +#ifdef Py_GIL_DISABLED + // Free-threaded: tying is disabled (see "Free-threaded builds" above). Leaving + // the stock slots in place keeps every chandle non-canonical, so the helpers + // no-op. Parameter referenced to avoid an unused-parameter warning. + (void)type_obj; + return; +#else + if (type_obj == nullptr || !PyType_Check(type_obj)) return; + PyTypeObject* tp = reinterpret_cast(type_obj); + tp->tp_alloc = &TVMFFIPyTpAlloc; + tp->tp_free = &TVMFFIPyTpFree; +#endif +} + +//--------------------------------------------------------------- +// Custom allocator hook. +//--------------------------------------------------------------- + +/*! + * \brief Allocator entry registered with TVMFFISetCustomAllocator at + * Cython module init. Allocates ``sizeof(PyCustomAllocHeader) + size`` bytes + * with ``alignment``, zero-inits the header to the Detached + * state, wires ``base.delete_space = &TVMFFIPyDeleteSpace``, and + * returns the T location. + * + * Handler::New static_asserts ``alignof(T) <= alignof(max_align_t)``, so + * the runtime ``alignment`` is bounded and ``base + sizeof(PyCustomAllocHeader)`` + * (= ``base + 16``) lands T naturally aligned for any T we allocate. + */ +inline void* TVMFFIPyAllocate(size_t size, size_t alignment, int32_t /*type_index*/, + void* /*context*/) { + void* base_alloc = + ::tvm::ffi::details::AlignedAllocRuntime(sizeof(PyCustomAllocHeader) + size, alignment); + auto* h = static_cast(base_alloc); + h->tagged_pyobj = nullptr; // Detached + h->base.delete_space = &TVMFFIPyDeleteSpace; + return static_cast(base_alloc) + sizeof(PyCustomAllocHeader); +} + +/*! + * \brief delete_space callback installed by TVMFFIPyAllocate, invoked from the + * C++ Weak deleter when the chandle's block is freed -- the final step + * of the dealloc handshake (Cases 1 and 2 above decide who frees the + * allocation, off the allocation's ``chandle`` marker). + * + * Acts only on a bound Inactive cached allocation (tag 1); an Active binding + * cannot reach here (a live wrapper owns +1, so the chandle cannot die), and + * guarding on the tag makes a stray Active binding a safe leak, not a UAF. + * + * Reading the marker under the GIL is load-bearing: it orders this read after + * the caching thread's ``tp_free`` NULL-write (also under the GIL), so a + * cross-thread last-ref never observes a stale marker. The chandle's malloc + * block is freed in all cases; once Python is finalizing + * (``TVMFFIPyIsPythonAlive() == false``) the allocation is leaked and only that + * block is freed. + */ +inline void TVMFFIPyDeleteSpace(void* ptr) { + void* base_alloc = static_cast(ptr) - sizeof(PyCustomAllocHeader); + auto* h = static_cast(base_alloc); + if (TVMFFIPyTagIsInactive(h->tagged_pyobj) && TVMFFIPyIsPythonAlive()) { + PyGILState_STATE gstate = PyGILState_Ensure(); + if (TVMFFIPyIsPythonAlive()) { + // Mask off the Inactive tag to recover the real allocation pointer. + PyObject* cached_alloc = TVMFFIPyRemoveTag(h->tagged_pyobj); + void** cached_chandle_ptr = TVMFFICyObjectGetCHandlePtr(cached_alloc); + if (*cached_chandle_ptr == TVM_FFI_PY_DEALLOC_TRANSIT) { + // This DecRef drove the free; defer the allocation free to tp_free. + *cached_chandle_ptr = nullptr; + } else { + // Chandle died independently; reclaim the cached allocation now. + PyObject_GC_Del(cached_alloc); + } + } + PyGILState_Release(gstate); + } + ::tvm::ffi::details::AlignedFree(base_alloc); +} + +/*! + * \brief Install ``TVMFFIPyAllocate`` as the process-wide custom allocator. + * Storage for the registered entry is a function-static so the + * address is process-stable. + */ +extern "C" TVM_FFI_INLINE int TVMFFIPyRegisterDefaultAllocator() { +#ifdef Py_GIL_DISABLED + // Free-threaded: do not install the tying allocator (see "Free-threaded + // builds" above). libtvm_ffi's builtin default allocator stays in effect, so + // every chandle is non-canonical and the state-machine helpers no-op. Report + // success -- there is nothing to install and nothing can fail. + return 0; +#else + static TVMFFICustomAllocator allocator{&TVMFFIPyAllocate, /*context=*/nullptr}; + return TVMFFISetCustomAllocator(&allocator); +#endif +} + //------------------------------------------------------------------------------------ // Helpers for Python thread-state attachment //------------------------------------------------------------------------------------ diff --git a/rust/tvm-ffi-sys/src/c_api.rs b/rust/tvm-ffi-sys/src/c_api.rs index 56209bfa0..910a0505a 100644 --- a/rust/tvm-ffi-sys/src/c_api.rs +++ b/rust/tvm-ffi-sys/src/c_api.rs @@ -384,7 +384,31 @@ pub struct TVMFFITypeInfo { pub metadata: *const TVMFFITypeMetadata, } +/// Mandatory header preceding each Object body. See `TVMFFIObjectAllocHeader` +/// in `tvm/ffi/c_api.h`. +#[repr(C)] +pub struct TVMFFIObjectAllocHeader { + pub delete_space: Option, +} + +/// Custom allocator entry. See `TVMFFICustomAllocator` in `tvm/ffi/c_api.h`. +#[repr(C)] +pub struct TVMFFICustomAllocator { + pub allocate: Option< + unsafe extern "C" fn( + size: usize, + alignment: usize, + type_index: i32, + context: *mut c_void, + ) -> *mut c_void, + >, + pub context: *mut c_void, +} + unsafe extern "C" { + pub fn TVMFFIGetCustomAllocator() -> *mut TVMFFICustomAllocator; + pub fn TVMFFISetCustomAllocator(allocator: *mut TVMFFICustomAllocator) -> i32; + pub fn TVMFFITypeKeyToIndex(type_key: *const TVMFFIByteArray, out_tindex: *mut i32) -> i32; pub fn TVMFFIFunctionGetGlobal( name: *const TVMFFIByteArray, diff --git a/rust/tvm-ffi/src/object.rs b/rust/tvm-ffi/src/object.rs index dc1970a42..8cb77ce69 100644 --- a/rust/tvm-ffi/src/object.rs +++ b/rust/tvm-ffi/src/object.rs @@ -22,7 +22,7 @@ use std::sync::atomic::AtomicU64; use crate::derive::ObjectRef; pub use tvm_ffi_sys::TVMFFITypeIndex as TypeIndex; /// Object related ABI handling -use tvm_ffi_sys::{TVMFFIObject, COMBINED_REF_COUNT_BOTH_ONE}; +use tvm_ffi_sys::{TVMFFIGetCustomAllocator, TVMFFIObject, COMBINED_REF_COUNT_BOTH_ONE}; /// Object type is by default the TVMFFIObject #[repr(C)] @@ -120,7 +120,7 @@ pub(crate) mod unsafe_ { use std::ffi::c_void; use std::sync::atomic::{fence, Ordering}; - use tvm_ffi_sys::TVMFFIObject; + use tvm_ffi_sys::{TVMFFIObject, TVMFFIObjectAllocHeader}; use tvm_ffi_sys::TVMFFIObjectDeleterFlagBitMask::{ kTVMFFIObjectDeleterFlagBitMaskBoth, kTVMFFIObjectDeleterFlagBitMaskStrong, kTVMFFIObjectDeleterFlagBitMaskWeak, @@ -197,63 +197,24 @@ pub(crate) mod unsafe_ { (obj.combined_ref_count.load(Ordering::Relaxed) >> 32) as usize } - /// Generic object deleter that works for object allocated from Box then into_raw + /// Generic object deleter for objects allocated through the registered + /// `TVMFFICustomAllocator`. pub unsafe extern "C" fn object_deleter_for_new(ptr: *mut c_void, flags: i32) where T: super::ObjectCore, { let obj = ptr as *mut T; if flags & kTVMFFIObjectDeleterFlagBitMaskStrong as i32 != 0 { - // calling destructor of the object, does not free the memory std::ptr::drop_in_place(obj); } if flags & kTVMFFIObjectDeleterFlagBitMaskWeak as i32 != 0 { - // free the memory - std::alloc::dealloc(ptr as *mut u8, std::alloc::Layout::new::()); - } - } - - pub unsafe extern "C" fn object_deleter_for_new_with_extra_items( - ptr: *mut c_void, - flags: i32, - ) where - T: super::ObjectCoreWithExtraItems, - { - let obj = ptr as *mut T; - if flags == kTVMFFIObjectDeleterFlagBitMaskBoth as i32 { - // must get extra items count before dropping the object - let extra_items_count = T::extra_items_count(&(*obj)); - std::ptr::drop_in_place(obj); - let layout = std::alloc::Layout::from_size_align( - std::mem::size_of::() + extra_items_count * std::mem::size_of::(), - std::mem::align_of::(), - ) - .unwrap(); - // free the memory - std::alloc::dealloc(ptr as *mut u8, layout); - } else { - assert_eq!(std::mem::size_of::() % std::mem::size_of::(), 0); - if flags & kTVMFFIObjectDeleterFlagBitMaskStrong as i32 != 0 { - // must get extra items count before dropping the object - let extra_items_count = T::extra_items_count(&(*obj)); - // calling destructor of the object, does not free the memory - std::ptr::drop_in_place(obj); - // record extra count in the original memory - std::ptr::write(obj as *mut u64, extra_items_count as u64); - } - if flags & kTVMFFIObjectDeleterFlagBitMaskWeak as i32 != 0 { - // read extra items count from the original memory - // note we can no longer read it by calling T::extra_items_count(&(*obj)) - // because the object is already dropped - let extra_items_count = std::ptr::read(obj as *mut u64) as usize; - let layout = std::alloc::Layout::from_size_align( - std::mem::size_of::() + extra_items_count * std::mem::size_of::(), - std::mem::align_of::(), - ) - .unwrap(); - // free the memory - std::alloc::dealloc(ptr as *mut u8, layout); - } + let header_ptr = (ptr as *mut u8) + .sub(std::mem::size_of::()) + as *mut TVMFFIObjectAllocHeader; + let delete_space = (*header_ptr) + .delete_space + .expect("TVMFFIObjectAllocHeader::delete_space must be set by the allocator"); + delete_space(ptr); } } } @@ -285,14 +246,24 @@ unsafe impl ObjectCore for Object { //--------------------- // ObjectArc //--------------------- + +/// Allocate via the process-wide `TVMFFICustomAllocator`. +unsafe fn allocate_via_registry(size: usize, alignment: usize, type_index: i32) -> *mut u8 { + let alloc = TVMFFIGetCustomAllocator(); + let allocate_fn = (*alloc) + .allocate + .expect("TVMFFICustomAllocator::allocate must be set"); + allocate_fn(size, alignment, type_index, (*alloc).context) as *mut u8 +} + impl ObjectArc { pub fn new(data: T) -> Self { unsafe { - let layout = std::alloc::Layout::new::(); - let raw_data_ptr = std::alloc::alloc(layout); - if raw_data_ptr.is_null() { - std::alloc::handle_alloc_error(layout); - } + let raw_data_ptr = allocate_via_registry( + std::mem::size_of::(), + std::mem::align_of::(), + T::type_index(), + ); let ptr = raw_data_ptr as *mut T; std::ptr::write(ptr, data); // now override the header directly @@ -322,15 +293,9 @@ impl ObjectArc { assert_eq!(std::mem::align_of::() % std::mem::align_of::(), 0); assert_eq!(std::mem::size_of::() % std::mem::align_of::(), 0); let extra_items_count = T::extra_items_count(&data); - let layout = std::alloc::Layout::from_size_align( - std::mem::size_of::() + extra_items_count * std::mem::size_of::(), - std::mem::align_of::(), - ) - .unwrap(); - let raw_data_ptr = std::alloc::alloc(layout); - if raw_data_ptr.is_null() { - std::alloc::handle_alloc_error(layout); - } + let total_size = std::mem::size_of::() + extra_items_count * std::mem::size_of::(); + let raw_data_ptr = + allocate_via_registry(total_size, std::mem::align_of::(), T::type_index()); let ptr = raw_data_ptr as *mut T; std::ptr::write(ptr, data); // now override the header directly @@ -340,7 +305,7 @@ impl ObjectArc { combined_ref_count: AtomicU64::new(COMBINED_REF_COUNT_BOTH_ONE), type_index: T::type_index(), __padding: 0, - deleter: Some(unsafe_::object_deleter_for_new_with_extra_items::), + deleter: Some(unsafe_::object_deleter_for_new::), }, ); // move into the object arc ptr diff --git a/src/ffi/custom_allocator.cc b/src/ffi/custom_allocator.cc new file mode 100644 index 000000000..09edeab54 --- /dev/null +++ b/src/ffi/custom_allocator.cc @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * \file src/ffi/custom_allocator.cc + * \brief Process-wide registry for the custom Object allocator and the + * builtin default allocator behind it. See ObjAllocatorBase / + * Handler::New in for the allocator contract. + */ +#include +#include +#include +#include +#include + +namespace tvm { +namespace ffi { +namespace { + +// Fixed body offset: the deleter gets only the body pointer (no size/alignment), +// so allocate and free must agree on a single offset. alignof(max_align_t) is a +// multiple of every supported object alignment (capped there in memory.h) and is +// >= the header, so the body stays aligned and the header fits before it. +constexpr size_t kBuiltinDefaultBodyOffset = alignof(std::max_align_t); +static_assert(kBuiltinDefaultBodyOffset >= sizeof(TVMFFIObjectAllocHeader)); + +void BuiltinDefaultDeleteSpace(void* ptr) { + details::AlignedFree(static_cast(ptr) - kBuiltinDefaultBodyOffset); +} + +void* BuiltinDefaultAllocate(size_t size, size_t alignment, int32_t /*type_index*/, + void* /*context*/) { + void* base_alloc = details::AlignedAllocRuntime(kBuiltinDefaultBodyOffset + size, alignment); + void* ptr = static_cast(base_alloc) + kBuiltinDefaultBodyOffset; + details::ObjectUnsafe::GetObjectAllocHeaderFromPtr(ptr)->delete_space = + &BuiltinDefaultDeleteSpace; + return ptr; +} + +class CustomAllocatorRegistry { + public: + CustomAllocatorRegistry() : current_(BuiltinDefault()) {} + + TVMFFICustomAllocator* Get() const { return current_; } + + void Set(TVMFFICustomAllocator* allocator) { + current_ = allocator != nullptr ? allocator : BuiltinDefault(); + } + + static CustomAllocatorRegistry* Global() { + static CustomAllocatorRegistry inst; + return &inst; + } + + private: + static TVMFFICustomAllocator* BuiltinDefault() { + static TVMFFICustomAllocator builtin{&BuiltinDefaultAllocate, /*context=*/nullptr}; + return &builtin; + } + + TVMFFICustomAllocator* current_; +}; + +} // namespace +} // namespace ffi +} // namespace tvm + +TVMFFICustomAllocator* TVMFFIGetCustomAllocator(void) { + TVM_FFI_LOG_EXCEPTION_CALL_BEGIN(); + return tvm::ffi::CustomAllocatorRegistry::Global()->Get(); + TVM_FFI_LOG_EXCEPTION_CALL_END(TVMFFIGetCustomAllocator); +} + +int TVMFFISetCustomAllocator(TVMFFICustomAllocator* allocator) { + TVM_FFI_SAFE_CALL_BEGIN(); + tvm::ffi::CustomAllocatorRegistry::Global()->Set(allocator); + TVM_FFI_SAFE_CALL_END(); +} diff --git a/src/ffi/extra/dataclass.cc b/src/ffi/extra/dataclass.cc index 5032d1390..7693cfd47 100644 --- a/src/ffi/extra/dataclass.cc +++ b/src/ffi/extra/dataclass.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1700,7 +1701,9 @@ class RecursiveComparer : public ObjectGraphDFS(self_void); @@ -1720,7 +1723,7 @@ void PyClassDeleter(void* self_void, int flags) { }); } if (flags & kTVMFFIObjectDeleterFlagBitMaskWeak) { - std::free(self_void); + details::ObjectUnsafe::GetObjectAllocHeaderFromPtr(self_void)->delete_space(self_void); } } @@ -2041,16 +2044,20 @@ void PyClassRegisterTypeAttrColumns(int32_t type_index, int32_t total_size) { RegisterFFIInit(type_index); // Step 2. Register `__ffi_new__` Function new_fn = Function::FromTyped([type_index, total_size]() -> ObjectRef { - void* obj_ptr = std::calloc(1, static_cast(total_size)); - if (!obj_ptr) { - TVM_FFI_THROW(RuntimeError) << "Failed to allocate " << total_size << " bytes for type " - << TypeIndexToTypeKey(type_index); - } + // Route through the custom-allocator registry so the prepended + // TVMFFIObjectAllocHeader is in place for PyClassDeleter's Weak + // branch. Then memset to zero-init the T payload (mirrors the + // original calloc-based path). + size_t alloc_size = static_cast(total_size); + TVMFFICustomAllocator* alloc = TVMFFIGetCustomAllocator(); + void* obj_ptr = + alloc->allocate(alloc_size, alignof(::std::max_align_t), type_index, alloc->context); + std::memset(obj_ptr, 0, alloc_size); TVMFFIObject* ffi_obj = reinterpret_cast(obj_ptr); ffi_obj->type_index = type_index; ffi_obj->combined_ref_count = details::kCombinedRefCountBothOne; ffi_obj->deleter = PyClassDeleter; - // calloc zero-initializes all bytes. For non-trivial field types: + // memset above zero-initializes all bytes. For non-trivial field types: // - Any: zero state is {type_index=kTVMFFINone, v_int64=0}, representing None. // - ObjectRef: zero state is a null pointer. // Both are valid initial states whose destructors and assignment operators @@ -2064,10 +2071,12 @@ void PyClassRegisterTypeAttrColumns(int32_t type_index, int32_t total_size) { // Step 3. Register `__ffi_shallow_copy__` Function copy_fn = Function::FromTyped([type_index, total_size, type_info](const Object* src) -> ObjectRef { - void* obj_ptr = std::calloc(1, static_cast(total_size)); - if (!obj_ptr) { - TVM_FFI_THROW(RuntimeError) << "Failed to allocate for shallow copy"; - } + // Allocator + memset mirror RegisterFFINew above. + size_t alloc_size = static_cast(total_size); + TVMFFICustomAllocator* alloc = TVMFFIGetCustomAllocator(); + void* obj_ptr = + alloc->allocate(alloc_size, alignof(::std::max_align_t), type_index, alloc->context); + std::memset(obj_ptr, 0, alloc_size); TVMFFIObject* ffi_obj = reinterpret_cast(obj_ptr); ffi_obj->type_index = type_index; ffi_obj->combined_ref_count = details::kCombinedRefCountBothOne; diff --git a/tests/python/test_function.py b/tests/python/test_function.py index 362911880..a920939f9 100644 --- a/tests/python/test_function.py +++ b/tests/python/test_function.py @@ -177,40 +177,46 @@ def echo(x: Any) -> Any: assert tvm_ffi.get_global_func("mytest.echo", allow_missing=True) is None +@pytest.mark.skipif( + hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled(), + reason="PyObject-tying (wrapper aliasing) is disabled on free-threaded Python", +) def test_rvalue_ref() -> None: + # Under universal cache-on the callback's arg aliases the caller's + # wrapper, so use_count inside is 1 (one wrapper, one chandle ref). + # ``_move()`` on either side detaches that wrapper's binding before + # the C++ AnyViewToOwnedAny transfer nulls the source chandle. use_count = tvm_ffi.get_global_func("testing.object_use_count") def callback(x: Any, expected_count: int) -> Any: - # The use count of TVM FFI objects is decremented as part of - # `ObjectRef.__del__`, which runs when the Python object is - # destructed. However, Python object destruction is not - # deterministic, and even CPython's reference-counting is - # considered an implementation detail. Therefore, to ensure - # correct results from this test, `gc.collect()` must be - # explicitly called. + # ``gc.collect()`` ensures Python destructors have run so + # use_count reflects only live wrappers. gc.collect() assert expected_count == use_count(x) return x._move() f = tvm_ffi.convert(callback) - def check0() -> None: + def check_caller_move() -> None: + # Caller passes ``x._move()``: callback receives a fresh canonical + # wrapper for the moved-in chandle. x = tvm_ffi.convert([1, 2]) assert use_count(x) == 1 - f(x, 2) f(x._move(), 1) assert x.__ctypes_handle__().value is None - def check1() -> None: + def check_callback_move() -> None: + # Callback returns ``x._move()``: caller sees a fresh canonical + # wrapper, distinct from the now-empty ``x``. x = tvm_ffi.convert([1, 2]) assert use_count(x) == 1 - y = f(x, 2) - f(x._move(), 2) + y = f(x, 1) + assert y is not x assert x.__ctypes_handle__().value is None assert y.__ctypes_handle__().value is not None - check0() - check1() + check_caller_move() + check_callback_move() def test_echo_with_opaque_object() -> None: diff --git a/tests/python/test_pyobject_tying.py b/tests/python/test_pyobject_tying.py new file mode 100644 index 000000000..6d8bceaf8 --- /dev/null +++ b/tests/python/test_pyobject_tying.py @@ -0,0 +1,761 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for the PyObject-tying feature (cache & revive). + +A 16-byte ``PyCustomAllocHeader`` is prepended to every Object that goes +through the Python custom allocator. Its ``tagged_pyobj`` field is a tagged +pointer to the canonical Python wrapper, encoding a three-state machine +driven by the custom ``tp_alloc`` / ``tp_free`` slots and the +``make_ret_object`` dispatcher in ``tvm_ffi_python_helpers.h``: + + Detached no Python wrapper bound to the chandle (raw == NULL) + Active canonical wrapper alive; ``wrapper.chandle == chandle`` (tag 0) + Inactive wrapper dropped but its memory is cached as a dead, + untracked allocation (tag 1); revived in place at the same address + on the next ``make_ret_object`` + +Active gives ``a.x is a.x``. Inactive gives stable ``id()`` across a +drop-then-rewrap cycle when the C++ object outlives the wrapper — without a +leak: the cached allocation is revived (``tp_alloc``), not resurrected. + +Note: some class/method names below retain the older "StateB" (Active) and +"StateC" (Inactive) labels from the resurrection-era design; the behavior they +assert is unchanged under cache & revive. +""" + +from __future__ import annotations + +import gc +import itertools +import pickle +import sys +import threading +import weakref +from typing import Any + +import pytest +import tvm_ffi +import tvm_ffi.testing +from tvm_ffi import dataclasses as dc + +# --------------------------------------------------------------------------- +# Test type registration +# --------------------------------------------------------------------------- +_counter = itertools.count() + + +def _unique_key(base: str) -> str: + return f"testing.pyobject_tying.{base}_{next(_counter)}" + + +def _is_free_threaded_python() -> bool: + return hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled() + + +# PyObject-tying (wrapper identity stickiness: ``a.x is a.x``, stable ``id()`` +# across drop+revive, ``f(x) is x``) relies on a custom allocator + tp_alloc / +# tp_free slots whose header state machine is GIL-serialized. On free-threaded +# builds the feature is disabled wholesale (see the "Free-threaded builds" note +# in ``tvm_ffi_python_helpers.h``): correctness and value semantics are kept, +# but identity stickiness is not. Tests that assert identity stickiness are +# skipped there; value / no-crash / no-leak / distinctness tests stay active. +requires_tying = pytest.mark.skipif( + _is_free_threaded_python(), + reason="PyObject-tying (wrapper identity stickiness) is disabled on free-threaded Python", +) + + +# Module-level fixtures so the registered types persist across all tests +# in a file (re-registering with the same key is an error). + + +@dc.py_class(_unique_key("Inner")) +class Inner(dc.Object): + val: int + + +@dc.py_class(_unique_key("Outer")) +class Outer(dc.Object): + x: Inner + + +@dc.py_class(_unique_key("MutableOuter"), frozen=False) +class MutableOuter(dc.Object): + x: Inner + + +# --------------------------------------------------------------------------- +# Active: identity stable while wrapper is alive +# --------------------------------------------------------------------------- +@requires_tying +class TestStateBIdStable: + """``a.x is a.x`` and ``id(a.x)`` stable while ``a`` lives.""" + + def test_attr_access_is_same_object(self) -> None: + """Repeated attribute access returns the same wrapper object.""" + a = Outer(Inner(42)) + assert a.x is a.x + + def test_attr_access_id_stable(self) -> None: + """id() of an attribute is stable across repeated access.""" + a = Outer(Inner(7)) + ids = {id(a.x) for _ in range(100)} + assert len(ids) == 1 + + def test_chained_attr_access(self) -> None: + """Chained attribute access reuses the intermediate wrapper.""" + # Two-level access: outer.x.val triggers two FieldGetters; the + # intermediate Inner wrapper must be reused so chained access + # doesn't create transient garbage at different addresses. + a = Outer(Inner(11)) + assert a.x is a.x + assert a.x.val == 11 + assert a.x.val == 11 + + def test_two_outers_distinct_inners(self) -> None: + """Distinct C++ objects produce distinct Python wrappers.""" + # Different Outer instances wrapping different Inner C++ objects + # MUST produce distinct Python wrappers. + a = Outer(Inner(1)) + b = Outer(Inner(2)) + assert a.x is not b.x + assert id(a.x) != id(b.x) + + +# --------------------------------------------------------------------------- +# Universal cache-on: function returns alias the canonical wrapper +# --------------------------------------------------------------------------- +@requires_tying +class TestStateBFunctionReturns: + """Every FFI return funnels through ``make_ret_object``: the wrapper + for a chandle that already has a canonical Python wrapper *is* that + wrapper. + + Note: ``get_global_func("name")`` does NOT participate in this cache + (most registry entries are allocated at C++ static init, before the + Python custom allocator is registered, so their chandle has no + PyCustomAllocHeader). Stable ``id()`` for ``get_global_func`` is + deferred — see TODO in ``function.pxi::_get_global_func``. + """ + + def test_function_return_aliases_arg(self) -> None: + """An FFI return aliases the canonical wrapper of its argument.""" + identity = tvm_ffi.convert(lambda x: x) + x = tvm_ffi.convert([1, 2]) + assert identity(x) is x + + +# --------------------------------------------------------------------------- +# Active -> Inactive -> Active: identity preserved across wrapper drop +# --------------------------------------------------------------------------- +class TestStateCRevive: + """When the C++ object outlives the wrapper, re-fetching it via the + cached field-getter path must reuse the preserved memory. + """ + + @requires_tying + def test_id_preserved_across_finalize(self) -> None: + """id() and chandle survive a wrapper finalize-and-revive cycle.""" + outer = Outer(Inner(1)) + first = outer.x # Active: canonical wrapper installed + first_id = id(first) + first_chandle = first.__chandle__() + del first + gc.collect() # transitions to Inactive + + revived = outer.x # revive: same address + assert id(revived) == first_id + assert revived.__chandle__() == first_chandle + + @requires_tying + def test_id_preserved_across_many_drops(self) -> None: + """Each drop+refetch cycle reuses the same wrapper address.""" + outer = Outer(Inner(1)) + first_id = id(outer.x) + # Deterministic per cycle: each drop+refetch must reuse the same + # address, so a modest repeat count surfaces any regression. The + # cost here is dominated by gc.collect() (~13ms each in this env). + for _ in range(20): + ref = outer.x + ref_id = id(ref) + del ref + gc.collect() + assert ref_id == first_id + + @requires_tying + def test_no_leak_churn_2k_cycles(self) -> None: + """Revive must reuse exactly one address across many cycles.""" + outer = Outer(Inner(1)) + addresses: set[int] = set() + for i in range(2000): + ref = outer.x + addresses.add(id(ref)) + del ref + if i % 500 == 499: + gc.collect() + gc.collect() + assert len(addresses) == 1, ( + f"Inactive -> Active revive should yield exactly one wrapper address; " + f"saw {len(addresses)}" + ) + + def test_chandle_unchanged_across_revive(self) -> None: + """The chandle is unchanged across a revive cycle.""" + outer = Outer(Inner(99)) + chandle1 = outer.x.__chandle__() + gc.collect() + chandle2 = outer.x.__chandle__() + assert chandle1 == chandle2 + + def test_field_value_correct_after_revive(self) -> None: + """Field value is correct after the wrapper is revived.""" + outer = Outer(Inner(123)) + _ = outer.x # ensure wrapper exists then dies + gc.collect() + assert outer.x.val == 123 + + def test_chandle_dies_while_in_state_c(self) -> None: + """A chandle dying while Inactive reclaims the cached allocation safely.""" + # When the chandle finally dies while its wrapper is Inactive, + # ``TVMFFIPyDeleteSpace`` must reclaim the cached (untracked) + # allocation under the GIL via ``PyObject_GC_Del`` and then free the + # malloc block. A regression here typically manifests as a + # segfault — getting past ``gc.collect()`` and the follow-up + # alloc means the path stayed safe. + # + # A UAF/corruption here is deterministic -- it trips within the first + # few iterations (the alloc-after-free that surfaces heap damage needs + # only a handful of repeats), so a small count suffices. Each iter runs + # two gc.collect()s (~13ms each in this env), so keep the count modest. + for _ in range(15): + outer = Outer(Inner(7)) + _ = outer.x # Detached -> Active + gc.collect() # wrapper drops; Active -> Inactive + del outer # drops the last C++ ref; deleter fires on the inactive cached allocation + gc.collect() + # If we got here, the cached-allocation cleanup path is safe. + fresh = Outer(Inner(11)) + assert fresh.x.val == 11 + + +# --------------------------------------------------------------------------- +# No unbounded wrapper leak across many distinct chandles +# --------------------------------------------------------------------------- +class TestNoUnboundedLeak: + """Many distinct chandles, each pinned alive by a parent's field, must + not accumulate one live Python wrapper per chandle. + + Root cause this guards against (the PEP-442 resurrection-era design that + cache & revive replaced): finalize resurrected the wrapper on the + Active -> Inactive transition with ``Py_IncRef(wrapper)`` -- a phantom +1 + reclaimed only when the chandle itself died. But ``tp_finalize`` runs at + most once per object (CPython sets a permanent GC-finalized bit), so a + wrapper that went Inactive and was later revived (Inactive -> Active) + never finalized again and stayed pinned by its own phantom refcount. + Every distinct chandle that was transiently wrapped and dropped then + leaked exactly one live wrapper for as long as that chandle stayed alive + (e.g. held by a parent's field) -- an unbounded leak. + + ``TestStateCRevive.test_no_leak_churn_2k_cycles`` cycles a *single* + chandle and asserts address reuse; this asserts the complementary + property -- across *many distinct* chandles the count of live wrappers + stays flat, because cache & revive reuses the cached allocation instead of pinning it. + """ + + def test_distinct_chandles_no_wrapper_leak(self) -> None: + """Many distinct chandles do not pin one live wrapper each.""" + + # Locally-registered types: the gc.get_objects() scan below counts + # only instances of *this* InnerLeak, so leftover instances from other + # tests (which use the module-level Inner) cannot pollute the count. + @dc.py_class(_unique_key("InnerLeak")) + class InnerLeak(dc.Object): + val: int + + @dc.py_class(_unique_key("OuterLeak")) + class OuterLeak(dc.Object): + x: InnerLeak + + def live_inner_count() -> int: + return sum(1 for ob in gc.get_objects() if type(ob) is InnerLeak) + + # n is deliberately modest: the PEP-442 leak is strictly linear (one + # pinned wrapper per distinct chandle), so leaked == 0 trips at any n + # under a regression -- a larger n buys no detection power, only time. + n = 1000 + parents = [] + gc.collect() + before = live_inner_count() + for i in range(n): + outer = OuterLeak(InnerLeak(i)) + parents.append(outer) # keep the chandle legitimately alive + w = outer.x # Active wrapper for this distinct chandle + del w # -> Inactive (revivable cached allocation, must not pin the wrapper) + gc.collect() + leaked = live_inner_count() - before + assert leaked == 0, ( + f"PyObject-tying leak: {leaked} wrappers pinned alive (expected 0; " + f"one pinned wrapper per distinct chandle == unbounded leak)" + ) + + +# --------------------------------------------------------------------------- +# Finalizer (__del__) types are excluded from inactivation +# --------------------------------------------------------------------------- +class TestFinalizerNotInactivated: + """A subclass that defines ``__del__`` gets a ``tp_finalize`` slot, and + CPython sets a *permanent* GC-finalized bit the first time it runs — + never running it again on that block. Reviving an inactive cached allocation would + therefore silently suppress ``__del__`` for every revived generation. + ``TVMFFIPyIsInactiveEligible`` excludes such types, so they genuinely free on + every drop and ``__del__`` fires once per generation. The cost is no + stable-``id``-across-drop for finalizer types (acceptable, documented). + """ + + def test_del_fires_every_generation(self) -> None: + """__del__ fires on every drop of a finalizer type, not once-ever.""" + calls: list[int] = [] + + @dc.py_class(_unique_key("InnerDel")) + class InnerDel(dc.Object): + val: int + + def __del__(self) -> None: + # ``self`` is being finalized; record that __del__ ran. + calls.append(1) + + @dc.py_class(_unique_key("OuterDel")) + class OuterDel(dc.Object): + x: InnerDel + + outer = OuterDel(InnerDel(5)) + for _ in range(5): + w = outer.x # fresh wrapper (finalizer type is never inactivated) + assert w.val == 5 + del w + gc.collect() # genuine free -> __del__ runs + # __del__ fired once per drop (5), not once-ever (the cached-allocation bug). + assert len(calls) >= 5, ( + f"__del__ must fire on every drop of a finalizer type; saw {len(calls)}" + ) + + @requires_tying + def test_finalizer_type_value_correct_across_refetch(self) -> None: + """A finalizer type's value and live identity hold across refetch.""" + + # Even without stable id(), the value and live identity must hold. + @dc.py_class(_unique_key("InnerDel")) + class InnerDel2(dc.Object): + val: int + + def __del__(self) -> None: + pass + + @dc.py_class(_unique_key("OuterDel")) + class OuterDel2(dc.Object): + x: InnerDel2 + + outer = OuterDel2(InnerDel2(7)) + assert outer.x is outer.x # live identity still holds + assert outer.x.val == 7 + gc.collect() + assert outer.x.val == 7 # refetch after drop still correct + + +# --------------------------------------------------------------------------- +# Inactivation only happens when other C++ holders exist +# --------------------------------------------------------------------------- +class TestLastRefCleanFree: + """When the wrapper is the last C++ ref, ``__dealloc__`` must NOT + inactivate — it detaches and genuinely frees the allocation (``tp_free`` runs + the real free). Regression guard for use-after-free in the deleter. + """ + + def test_drop_last_ref_no_crash(self) -> None: + """The bare convert-and-drop pattern must not segfault.""" + for _ in range(100): + x = tvm_ffi.convert([1, 2, 3]) + del x + gc.collect() + + def test_drop_then_realloc_value_correct(self) -> None: + """Dropping the last ref frees the memory; a fresh alloc is correct.""" + # Inner has no other holder -> __dealloc__ with strong_count == 1 -> + # genuine free (no inactivation), memory reclaimed. Next Inner is fresh. + a = Inner(1) + del a + gc.collect() + b = Inner(2) + assert b.val == 2 + + +# --------------------------------------------------------------------------- +# RValue-ref move path (regression for the test_rvalue_ref segfault) +# --------------------------------------------------------------------------- +class TestRValueRef: + """The ``_move()`` path nulls the source's ``chandle`` while the + Python wrapper stays alive. The RValueRef arg setter detaches the + canonical-wrapper binding eagerly so a downstream cache lookup + doesn't see a stale back-pointer to a wrapper whose chandle is NULL. + """ + + def test_move_into_callback_no_crash(self) -> None: + """Moving an arg into a callback that re-moves it does not crash.""" + # Universal cache-on: callback arg aliases caller's ``x`` (one + # wrapper, one chandle ref). The original blocker was a UAF in + # the deleter path triggered by the eager-detach + chandle-null + # interplay during a callback that returns ``x._move()`` — so we + # exercise that path here, asserting only that we don't crash + # and the post-call chandle is null. + use_count = tvm_ffi.get_global_func("testing.object_use_count") + + def callback(x: Any, expected: int) -> Any: + gc.collect() + assert expected == use_count(x) + return x._move() + + f = tvm_ffi.convert(callback) + x = tvm_ffi.convert([1, 2]) + # Caller-side _move: the rvalue setter detaches and the + # callback receives a fresh canonical wrapper for the chandle. + f(x._move(), 1) + assert x.__ctypes_handle__().value is None + + def test_ffi_move_then_re_fetch_works(self) -> None: + """Re-fetching a field after an FFI move yields a valid wrapper.""" + # The rvalue setter only runs when an FFI call consumes the + # ObjectRValueRef. ``discard`` provides that consumption — the + # setter detaches the canonical-wrapper binding (clearing + # ``tagged_pyobj`` to NULL) and the C++ side nulls the + # source wrapper's ``chandle``. ``outer``'s field still owns + # its strong ref, so re-fetching produces a valid wrapper. + outer = Outer(Inner(5)) + discard = tvm_ffi.convert(lambda _: None) + discard(outer.x._move()) + assert outer.x.val == 5 + + @requires_tying + def test_state_b_works_after_ffi_move(self) -> None: + """Active caching resumes on a fresh wrapper after an FFI move.""" + # Eager-detach during the move clears the header binding, so + # the chandle returns to Detached. The next access installs a fresh + # canonical wrapper, which then participates in Active caching + # like any other freshly-wrapped chandle. + outer = Outer(Inner(5)) + discard = tvm_ffi.convert(lambda _: None) + discard(outer.x._move()) + assert outer.x is outer.x + assert outer.x.val == 5 + + +# --------------------------------------------------------------------------- +# Pickle round-trip: must not corrupt the binding +# --------------------------------------------------------------------------- +class TestPickle: + """Pickle uses ``__init_handle_by_constructor__`` which calls + ``_install_chandle_binding`` — the binding must end in a valid + Active configuration. + """ + + def test_pickle_roundtrip_basic(self) -> None: + """A basic pickle round-trip preserves the value.""" + a = tvm_ffi.convert([1, 2, 3]) + s = pickle.dumps(a) + b = pickle.loads(s) + assert list(b) == [1, 2, 3] + + @requires_tying + def test_pickle_roundtrip_preserves_attr_identity(self) -> None: + """A restored wrapper has stable attribute identity.""" + outer = Outer(Inner(77)) + s = pickle.dumps(outer) + outer2 = pickle.loads(s) + # Restored wrapper is fully functional and identity is stable + # within the new instance. + assert outer2.x is outer2.x + assert outer2.x.val == 77 + + +# --------------------------------------------------------------------------- +# PyNativeObject (String / Bytes) — exempt from the header binding +# --------------------------------------------------------------------------- +class TestPyNativeExempt: + """PyNativeObject types (``String``, ``Bytes``) are value-typed: the + transient ``Object`` wrapper is discarded after construction. + They MUST NOT install a header binding. + """ + + def test_string_construction_and_compare(self) -> None: + """A converted String behaves as a native str.""" + s = tvm_ffi.convert("hello") + assert isinstance(s, str) + assert s == "hello" + + def test_bytes_construction_and_compare(self) -> None: + """A converted Bytes behaves as native bytes.""" + b = tvm_ffi.convert(b"world") + assert isinstance(b, bytes) + assert b == b"world" + + def test_string_repeated_construction(self) -> None: + """Repeated String construction does not corrupt adjacent headers.""" + # Repeatedly constructing strings shouldn't break the header on + # adjacent objects. + for _ in range(100): + s = tvm_ffi.convert("x" * 16) + assert s == "x" * 16 + + +# --------------------------------------------------------------------------- +# Type-mismatch fallthrough on revive +# --------------------------------------------------------------------------- +class TestTypeMismatchOnRevive: + """Registering many unrelated types between attribute accesses must + not corrupt existing wrappers' bindings, and the revive path must + still hit Inactive for the original wrapper. + """ + + @requires_tying + def test_field_access_works_after_many_unrelated_registrations(self) -> None: + """Unrelated type registrations don't corrupt an existing binding.""" + outer = Outer(Inner(1)) + first_id = id(outer.x) + # Register a bunch of unrelated classes (no shared chandle). + for _ in range(20): + + @dc.py_class(_unique_key("Tmp")) + class Tmp(dc.Object): + v: int + + _ = Tmp(0) + gc.collect() + # Original outer's revive cycle still works. + revived = outer.x + assert id(revived) == first_id + assert revived.val == 1 + + +# --------------------------------------------------------------------------- +# Field setter after revive +# --------------------------------------------------------------------------- +class TestFieldSetterAfterRevive: + """Replacing a field after a revive cycle should bind the new value + cleanly without disturbing the prior preserved wrapper for the + *previous* chandle (which may still be alive elsewhere). + """ + + @requires_tying + def test_assign_then_access(self) -> None: + """Replacing a field after a revive cycle binds the new value cleanly.""" + outer = MutableOuter(Inner(1)) + first_inner = outer.x + first_chandle = first_inner.__chandle__() + del first_inner + + # Replace x with a fresh Inner. + new_inner = Inner(2) + outer.x = new_inner + gc.collect() + + # The new outer.x has a different chandle (= new_inner's), so it + # must produce a wrapper that is NOT the preserved address of + # the original (which had a different chandle). + revised = outer.x + assert revised.val == 2 + assert revised.__chandle__() != first_chandle + # The new wrapper is canonical for its own chandle. + assert outer.x is revised + + +# --------------------------------------------------------------------------- +# Pickle stress: repeated round-trips +# --------------------------------------------------------------------------- +@requires_tying +class TestPickleStress: + """Pickle round-trips go through ``__init_handle_by_constructor__`` + which calls ``_install_chandle_binding``. Repeated round-trips must + not leak wrappers or corrupt the binding state. + """ + + def test_many_roundtrips(self) -> None: + """Repeated pickle round-trips keep the binding state valid.""" + for _ in range(200): + outer = Outer(Inner(7)) + restored = pickle.loads(pickle.dumps(outer)) + assert restored.x.val == 7 + assert restored.x is restored.x # Active on restored binding + + def test_roundtrip_then_state_c_revive(self) -> None: + """A pickle-installed binding survives a finalize-and-revive cycle.""" + # Round-trip, then exercise Inactive -> Active revive on the restored + # wrapper to confirm the binding installed by pickle survives + # finalize+rewrap. + outer = pickle.loads(pickle.dumps(Outer(Inner(99)))) + first_id = id(outer.x) + gc.collect() + assert id(outer.x) == first_id + assert outer.x.val == 99 + + +# --------------------------------------------------------------------------- +# Threading stress: concurrent attribute access +# --------------------------------------------------------------------------- +class TestThreadingStress: + """Multi-threaded smoke test for the cache-hit / Inactive revive + paths. Under classic CPython the GIL serializes Python bytecode so + most races are confined to the FFI call's nogil block; this test + exercises enough iterations to surface obvious crashes from any + GIL-released DecRef on the C++ side. + """ + + def test_concurrent_attr_access(self) -> None: + """Concurrent attribute access on a shared object does not crash.""" + outer = Outer(Inner(123)) + errors: list[BaseException] = [] + + def worker() -> None: + try: + for _ in range(2000): + assert outer.x.val == 123 + except BaseException as e: + errors.append(e) + + threads = [threading.Thread(target=worker) for _ in range(8)] + for t in threads: + t.start() + for t in threads: + t.join() + assert not errors, f"worker(s) raised: {errors!r}" + + def test_concurrent_independent_outers(self) -> None: + """Concurrent wrapper churn on per-thread objects does not crash.""" + # Each thread owns its own Outer; threads don't share chandles + # so the test mostly exercises wrapper churn (Active -> Inactive -> Active) in + # isolation. ``gc.collect()`` is called periodically (not every + # iteration) to keep the test under a few seconds. + errors: list[BaseException] = [] + + def worker(seed: int) -> None: + try: + outer = Outer(Inner(seed)) + for i in range(200): + ref = outer.x + assert ref.val == seed + del ref + if i % 20 == 19: + gc.collect() + except BaseException as e: + errors.append(e) + + threads = [threading.Thread(target=worker, args=(i,)) for i in range(8)] + for t in threads: + t.start() + for t in threads: + t.join() + assert not errors, f"worker(s) raised: {errors!r}" + + +# --------------------------------------------------------------------------- +# GC integration with Inactive +# --------------------------------------------------------------------------- +class TestStateCWithCyclicGC: + """Inactive cached allocations are dead, untracked memory (``subtype_dealloc`` + untracks before inactivation, and ``tp_alloc`` re-tracks only on revival). + ``gc.collect()`` between a drop and a refetch must not crash: it must + neither traverse the untracked cached allocation nor trip over the revived object. + """ + + def test_gc_collect_inside_revive_loop(self) -> None: + """gc.collect() between drop and refetch does not crash.""" + outer = Outer(Inner(5)) + # A bad traversal of the untracked cached allocation crashes deterministically on + # the first collect, so a modest count suffices; the cost here is the + # gc.collect() itself (~13ms each in this env), not the FFI work. + for _ in range(25): + ref = outer.x + del ref + gc.collect() # cached allocation is inactive (untracked) between drops + assert outer.x.val == 5 + + @requires_tying + def test_gc_collect_with_holder_keeping_chandle(self) -> None: + """gc.collect() around inactive memory keeps id() stable.""" + # The Inner chandle is held by ``outer``'s field for the lifetime + # of the test, so wrapper drops always inactivate the allocation. gc.collect + # between drops exercises GC stability around the inactive memory. + outer = Outer(Inner(11)) + first_id = id(outer.x) + # GC stability around the inactive cached allocation is deterministic per cycle; + # a modest count surfaces it. Dominated by gc.collect() (~13ms each). + for _ in range(20): + _ = outer.x + gc.collect() + assert id(outer.x) == first_id + + +# --------------------------------------------------------------------------- +# Isolation: many distinct chandles each Inactive at once +# --------------------------------------------------------------------------- +@requires_tying +class TestMultipleChandlesIsolation: + """A revive on one chandle must not corrupt the cached binding on + another. Each chandle's header is independent — verify by cycling + many in parallel. + """ + + def test_distinct_state_c_revive_independently(self) -> None: + """Each chandle revives to its own address and value independently.""" + outers = [Outer(Inner(i * 10)) for i in range(50)] + # Capture Active addresses, then drop to Inactive. + b_ids = [id(o.x) for o in outers] + gc.collect() + # Revive each independently; each must return its own preserved + # address with its own value. + for i, o in enumerate(outers): + revived = o.x + assert id(revived) == b_ids[i] + assert revived.val == i * 10 + + def test_interleaved_revives_do_not_cross_contaminate(self) -> None: + """Reviving in a different order keeps ids matched per-chandle.""" + # Build N outers, capture ids, drop all, then revive in a + # different order. Ids must still match per-chandle. + outers = [Outer(Inner(i)) for i in range(30)] + ids = [id(o.x) for o in outers] + gc.collect() + for i in reversed(range(30)): + assert id(outers[i].x) == ids[i] + + +# --------------------------------------------------------------------------- +# Weakref limitation (sentinel) +# --------------------------------------------------------------------------- +class TestWeakrefNotSupported: + """``CObject`` is a Cython cdef class without a ``__weakref__`` slot, + so weakrefs are not supported. This test documents that limitation: + if it ever starts passing, the Inactive-state interaction needs + design review (a weakref to an Active wrapper transitioning to Inactive + must behave correctly — the wrapper genuinely dies at refcount 0, so + old weakrefs should observe death and a revived wrapper should bind a + fresh weakref; see design §9.4). + """ + + def test_weakref_raises_typeerror(self) -> None: + """Taking a weakref to a wrapper raises TypeError (documented limit).""" + outer = Outer(Inner(1)) + with pytest.raises(TypeError): + weakref.ref(outer.x)