From abdec47f2c3de514a02d14f08fffe3fc097ed729 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 16 Mar 2026 17:37:49 -0700
Subject: [PATCH 01/31] Add managed-memory advise, prefetch and discard_prefetch to Buffer

---
 cuda_core/cuda/core/_memory/_buffer.pxd       |   1 +
 cuda_core/cuda/core/_memory/_buffer.pyx       | 284 ++++++++++++++++++
 cuda_core/docs/source/release/0.7.x-notes.rst |   5 +
 cuda_core/tests/test_memory.py                | 127 ++++++++
 4 files changed, 417 insertions(+)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index 91c0cfe24a..04b5707e18 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -12,6 +12,7 @@ cdef struct _MemAttrs:
     int device_id
     bint is_device_accessible
     bint is_host_accessible
+    bint is_managed
 
 
 cdef class Buffer:
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 83009f74ae..686585b527 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -72,6 +72,194 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
 :attr:`Buffer.handle`.
 """
 
+
+cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
+    "device",
+    "host",
+    "host_numa",
+    "host_numa_current",
+)
+
+cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
+    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
+    "host": "CU_MEM_LOCATION_TYPE_HOST",
+    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
+    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
+}
+
+cdef dict _MANAGED_ADVICE_ALIASES = {
+    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
+    "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
+    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
+    "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
+    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
+    "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
+    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
+    "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
+    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
+    "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
+    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
+    "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
+}
+
+cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
+    "set_read_mostly",
+    "unset_read_mostly",
+    "unset_preferred_location",
+))
+
+cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset((
+    "set_accessed_by",
+    "unset_accessed_by",
+))
+
+
+cdef inline object _managed_location_enum(str location_type):
+    """Look up the ``CUmemLocationType`` member for a managed-location type key."""
+    cdef str member_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
+    if hasattr(driver.CUmemLocationType, member_name):
+        return getattr(driver.CUmemLocationType, member_name)
+    # The installed bindings do not define this member, so the location cannot be expressed.
+    raise RuntimeError(f"Managed-memory location type {location_type!r} is not supported by the "
+                       f"installed cuda.bindings package.")
+
+
+cdef inline object _make_managed_location(str location_type, int location_id):
+    """Build a ``CUmemLocation``; "host" and "host_numa_current" override ``location_id``."""
+    cdef object mem_location = driver.CUmemLocation()
+    mem_location.type = _managed_location_enum(location_type)
+    if location_type == "host":
+        location_id = int(getattr(driver, "CU_DEVICE_CPU", -1))
+    elif location_type == "host_numa_current":
+        location_id = 0
+    mem_location.id = location_id
+    return mem_location
+
+
+cdef inline tuple _normalize_managed_advice(object advice):
+    """Return ``(short_alias, CUmem_advise member)`` for a string alias or enum value.
+
+    Raises ``ValueError`` for unknown advice and ``TypeError`` for other input types.
+    """
+    cdef str alias, attr_name
+    if isinstance(advice, str):
+        attr_name = _MANAGED_ADVICE_ALIASES.get(advice.lower())
+        if attr_name is None:
+            raise ValueError(
+                "advice must be one of "
+                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
+            )
+        # Long "cu_mem_advise_*" spellings must yield the short alias, because callers
+        # match the returned name against short-alias sets to pick location rules.
+        return attr_name[len("CU_MEM_ADVISE_"):].lower(), getattr(driver.CUmem_advise, attr_name)
+
+    if isinstance(advice, driver.CUmem_advise):
+        for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+            if not alias.startswith("cu_mem_advise_") and advice == getattr(driver.CUmem_advise, attr_name):
+                return alias, advice
+        raise ValueError(f"Unsupported advice value: {advice!r}")
+
+    raise TypeError("advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias")
+
+
+cdef inline object _normalize_managed_location(
+    object location,
+    object location_type,
+    str what,
+    bint allow_none=False,
+    bint allow_host=True,
+    bint allow_host_numa=True,
+    bint allow_host_numa_current=True,
+):
+    cdef object loc_type
+    cdef int loc_id
+
+    if isinstance(location, Device):
+        location = (<Device>location).device_id
+
+    if location_type is not None and not isinstance(location_type, str):
+        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
+
+    loc_type = None if location_type is None else (<str>location_type).lower()
+    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
+        raise ValueError(
+            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
+            f"or None, got {location_type!r}"
+        )
+
+    if loc_type is None:
+        if location is None:
+            if allow_none:
+                return _make_managed_location("host", -1)
+            raise ValueError(f"{what} requires a location")
+        if not isinstance(location, int):
+            raise TypeError(
+                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
+            )
+        loc_id = <int>location
+        if loc_id == -1:
+            loc_type = "host"
+        elif loc_id >= 0:
+            loc_type = "device"
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
+            )
+    elif loc_type == "device":
+        if isinstance(location, int) and <int>location >= 0:
+            loc_id = <int>location
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
+            )
+        return _make_managed_location(loc_type, loc_id)
+    elif loc_type == "host":
+        if location not in (None, -1):
+            raise ValueError(
+                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
+            )
+        if not allow_host:
+            raise ValueError(f"{what} does not support location_type='host'")
+        return _make_managed_location(loc_type, -1)
+    elif loc_type == "host_numa":
+        if not allow_host_numa:
+            raise ValueError(f"{what} does not support location_type='host_numa'")
+        if not isinstance(location, int) or <int>location < 0:
+            raise ValueError(
+                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
+            )
+        return _make_managed_location(loc_type, <int>location)
+    else:
+        if not allow_host_numa_current:
+            raise ValueError(f"{what} does not support location_type='host_numa_current'")
+        if location is not None:
+            raise ValueError(
+                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
+            )
+        return _make_managed_location(loc_type, 0)
+
+    if loc_type == "host" and not allow_host:
+        raise ValueError(f"{what} does not support host locations")
+    if loc_type == "host_numa" and not allow_host_numa:
+        raise ValueError(f"{what} does not support location_type='host_numa'")
+    if loc_type == "host_numa_current" and not allow_host_numa_current:
+        raise ValueError(f"{what} does not support location_type='host_numa_current'")
+    return _make_managed_location(<str>loc_type, loc_id)
+
+
+cdef inline void _require_managed_buffer(Buffer self, str what):
+    _init_mem_attrs(self)
+    if not self._mem_attrs.is_managed:
+        raise ValueError(f"{what} requires a managed-memory buffer")
+
+
+cdef inline void _require_managed_discard_prefetch_support():
+    """Raise ``RuntimeError`` when ``driver`` lacks ``cuMemDiscardAndPrefetchBatchAsync``."""
+    if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+        return
+    raise RuntimeError("Buffer.discard_prefetch requires cuda.bindings support for "
+                       "cuMemDiscardAndPrefetchBatchAsync")
+
 cdef class Buffer:
     """Represent a handle to allocated memory.
 
@@ -293,6 +481,99 @@ cdef class Buffer:
         finally:
             PyBuffer_Release(&buf)
 
+    def advise(
+        self,
+        advice: driver.CUmem_advise | str,
+        location: Device | int | None = None,
+        *,
+        location_type: str | None = None,
+    ):
+        """Apply a managed-memory advice to this buffer.
+
+        This method is only valid for buffers backed by managed memory.
+
+        Parameters
+        ----------
+        advice : :obj:`~driver.CUmem_advise` | str
+            Managed-memory advice to apply. String aliases such as
+            ``"set_read_mostly"``, ``"set_preferred_location"``, and
+            ``"set_accessed_by"`` are accepted.
+        location : :obj:`~_device.Device` | int | None, optional
+            Target location. When ``location_type`` is ``None``, values are
+            interpreted as a device ordinal, ``-1`` for host, or ``None`` for
+            advice values that ignore location.
+        location_type : str | None, optional
+            Explicit location kind. Supported values are ``"device"``,
+            ``"host"``, ``"host_numa"``, and ``"host_numa_current"``.
+        """
+        cdef str advice_name
+        _require_managed_buffer(self, "Buffer.advise")
+        advice_name, advice = _normalize_managed_advice(advice)
+        location = _normalize_managed_location(
+            location,
+            location_type,
+            "Buffer.advise",
+            allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
+            allow_host=True,
+            allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
+            allow_host_numa_current=advice_name == "set_preferred_location",
+        )
+        handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location))
+
+    def prefetch(
+        self,
+        location: Device | int | None = None,
+        *,
+        stream: Stream | GraphBuilder,
+        location_type: str | None = None,
+    ):
+        """Prefetch this managed-memory buffer to a target location."""
+        cdef Stream s = Stream_accept(stream)
+        _require_managed_buffer(self, "Buffer.prefetch")
+        location = _normalize_managed_location(
+            location,
+            location_type,
+            "Buffer.prefetch",
+            allow_none=False,
+            allow_host=True,
+            allow_host_numa=True,
+            allow_host_numa_current=True,
+        )
+        handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle))
+
+    def discard_prefetch(
+        self,
+        location: Device | int | None = None,
+        *,
+        stream: Stream | GraphBuilder,
+        location_type: str | None = None,
+    ):
+        """Discard this managed-memory buffer and prefetch it to a target location."""
+        cdef Stream s = Stream_accept(stream)
+        _require_managed_buffer(self, "Buffer.discard_prefetch")
+        _require_managed_discard_prefetch_support()
+        location = _normalize_managed_location(
+            location,
+            location_type,
+            "Buffer.discard_prefetch",
+            allow_none=False,
+            allow_host=True,
+            allow_host_numa=True,
+            allow_host_numa_current=True,
+        )
+        handle_return(
+            driver.cuMemDiscardAndPrefetchBatchAsync(
+                [self.handle],
+                [self._size],
+                1,
+                [location],
+                [0],
+                1,
+                0,
+                s.handle,
+            )
+        )
+
     def __dlpack__(
         self,
         *,
@@ -453,6 +734,7 @@ cdef inline int _query_memory_attrs(
         out.is_host_accessible = True
         out.is_device_accessible = False
         out.device_id = -1
+        out.is_managed = False
     elif (
         is_managed
         or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
@@ -461,10 +743,12 @@ cdef inline int _query_memory_attrs(
         out.is_host_accessible = True
         out.is_device_accessible = True
         out.device_id = device_id
+        out.is_managed = is_managed != 0
     elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
         out.is_host_accessible = False
         out.is_device_accessible = True
         out.device_id = device_id
+        out.is_managed = False
     else:
         with cython.gil:
             raise ValueError(f"Unsupported memory type: {memory_type}")
diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst
index 98551603b6..18b3bede36 100644
--- a/cuda_core/docs/source/release/0.7.x-notes.rst
+++ b/cuda_core/docs/source/release/0.7.x-notes.rst
@@ -35,6 +35,11 @@ New features
   preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or
   ``("host_numa", 3)``.
 
+- Added managed-memory controls on :class:`Buffer`: ``advise()``,
+  ``prefetch()``, and ``discard_prefetch()``. These methods validate that the
+  underlying allocation is managed memory and then forward to the corresponding
+  CUDA driver operations for range advice and migration.
+
 - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit
   control over host NUMA node placement. When ``ipc_enabled=True`` and
   ``numa_id`` is not set, the NUMA node is automatically derived from the
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 0473d2d183..dd146785ec 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1134,6 +1134,133 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+def _get_mem_range_attr(buffer, attribute, data_size):
+    return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
+
+
+def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+        pytest.skip("discard-prefetch requires cuda.bindings support")
+
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(4096)
+    stream = device.create_stream()
+
+    buffer.advise("set_read_mostly")
+    assert _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        4,
+    ) == 1
+
+    buffer.advise("set_preferred_location", device, location_type="device")
+    preferred_type = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
+        4,
+    )
+    preferred_id = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
+        4,
+    )
+    assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
+    assert preferred_id == device.device_id
+
+    buffer.prefetch(-1, stream=stream)
+    stream.sync()
+    last_type = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
+        4,
+    )
+    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST)
+
+    buffer.discard_prefetch(device, stream=stream)
+    stream.sync()
+    last_type = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
+        4,
+    )
+    last_id = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID,
+        4,
+    )
+    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
+    assert last_id == device.device_id
+
+    buffer.close()
+
+
+def test_managed_buffer_operations_support_external_managed_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    stream = device.create_stream()
+
+    buffer.prefetch(device, stream=stream)
+    stream.sync()
+
+    last_type = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
+        4,
+    )
+    last_id = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID,
+        4,
+    )
+    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
+    assert last_id == device.device_id
+
+    buffer.close()
+
+
+def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda):
+    device = Device()
+    device.set_current()
+
+    buffer = DummyDeviceMemoryResource(device).allocate(4096)
+    stream = device.create_stream()
+
+    with pytest.raises(ValueError, match="managed-memory buffer"):
+        buffer.advise("set_read_mostly")
+    with pytest.raises(ValueError, match="managed-memory buffer"):
+        buffer.prefetch(device, stream=stream)
+    with pytest.raises(ValueError, match="managed-memory buffer"):
+        buffer.discard_prefetch(device, stream=stream)
+
+    buffer.close()
+
+
+def test_managed_buffer_operation_validation(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(4096)
+    stream = device.create_stream()
+
+    with pytest.raises(ValueError, match="requires a location"):
+        buffer.prefetch(stream=stream)
+    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+        buffer.advise("set_accessed_by", 0, location_type="host_numa")
+    with pytest.raises(ValueError, match="location must be None or -1"):
+        buffer.prefetch(0, stream=stream, location_type="host")
+
+    buffer.close()
+
+
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch

From c418050043ef38cc15a74e733d9038d564068c0d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 16 Mar 2026 17:44:49 -0700
Subject: [PATCH 02/31] Reformat read-mostly assertion in test_memory.py

---
 cuda_core/tests/test_memory.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index dd146785ec..44d50e356c 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1151,11 +1151,14 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda):
     stream = device.create_stream()
 
     buffer.advise("set_read_mostly")
-    assert _get_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
-        4,
-    ) == 1
+    assert (
+        _get_mem_range_attr(
+            buffer,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+            4,
+        )
+        == 1
+    )
 
     buffer.advise("set_preferred_location", device, location_type="device")
     preferred_type = _get_mem_range_attr(

From b879fa5b13922b2a41122f31751cd11c0c1fbaee Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 16 Mar 2026 17:51:36 -0700
Subject: [PATCH 03/31] fixing ci compiler errors

---
 cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 686585b527..05a1667b3f 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -36,7 +36,7 @@ else:
     BufferProtocol = object
 
 from cuda.core._dlpack import DLDeviceType, make_py_capsule
-from cuda.core._utils.cuda_utils import driver
+from cuda.core._utils.cuda_utils import driver, handle_return
 from cuda.core._device import Device
 
 
@@ -175,7 +175,7 @@ cdef inline object _normalize_managed_location(
     cdef int loc_id
 
     if isinstance(location, Device):
-        location = (<Device>location).device_id
+        location = location.device_id
 
     if location_type is not None and not isinstance(location_type, str):
         raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")

From 04ee3de1859c91158f30a7bffd3246024d422f0e Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 09:07:10 -0700
Subject: [PATCH 04/31] skipping tests that aren't supported

---
 cuda_core/tests/test_memory.py | 130 ++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 44 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 44d50e356c..95c6e6e964 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1138,18 +1138,70 @@ def _get_mem_range_attr(buffer, attribute, data_size):
     return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
 
 
-def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda):
-    device = Device()
-    skip_if_managed_memory_unsupported(device)
-    device.set_current()
+def _skip_if_managed_allocation_unsupported(device):
+    try:
+        if not device.properties.managed_memory:
+            pytest.skip("Device does not support managed memory operations")
+    except AttributeError:
+        pytest.skip("Managed-memory buffer operations require CUDA support")
+
 
+def _skip_if_managed_location_ops_unsupported(device):
+    _skip_if_managed_allocation_unsupported(device)
+    try:
+        if not device.properties.concurrent_managed_access:
+            pytest.skip("Device does not support concurrent managed memory access")
+    except AttributeError:
+        pytest.skip("Managed-memory location operations require CUDA support")
+
+
+def _skip_if_managed_discard_prefetch_unsupported(device):
+    _skip_if_managed_location_ops_unsupported(device)
     if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
         pytest.skip("discard-prefetch requires cuda.bindings support")
 
+    visible_devices = Device.get_all_devices()
+    if not all(dev.properties.concurrent_managed_access for dev in visible_devices):
+        pytest.skip("discard-prefetch requires concurrent managed access on all visible devices")
+
+
+def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+
     mr = create_managed_memory_resource_or_skip()
     buffer = mr.allocate(4096)
     stream = device.create_stream()
 
+    buffer.prefetch(-1, stream=stream)
+    stream.sync()
+    last_location = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        4,
+    )
+    assert last_location == -1
+
+    buffer.prefetch(device, stream=stream)
+    stream.sync()
+    last_location = _get_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        4,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
+def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda):
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+
     buffer.advise("set_read_mostly")
     assert (
         _get_mem_range_attr(
@@ -1160,70 +1212,60 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda):
         == 1
     )
 
-    buffer.advise("set_preferred_location", device, location_type="device")
-    preferred_type = _get_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE,
-        4,
-    )
-    preferred_id = _get_mem_range_attr(
+    # cuda.bindings currently exposes the combined location attributes for
+    # cuMemRangeGetAttribute, so use the legacy location query here.
+    buffer.advise("set_preferred_location", location_type="host")
+    preferred_location = _get_mem_range_attr(
         buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
         4,
     )
-    assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
-    assert preferred_id == device.device_id
+    assert preferred_location == -1
 
-    buffer.prefetch(-1, stream=stream)
-    stream.sync()
-    last_type = _get_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
-        4,
-    )
-    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST)
+    buffer.close()
 
-    buffer.discard_prefetch(device, stream=stream)
+
+def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda):
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    stream = device.create_stream()
+
+    buffer.prefetch(device, stream=stream)
     stream.sync()
-    last_type = _get_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
-        4,
-    )
-    last_id = _get_mem_range_attr(
+
+    last_location = _get_mem_range_attr(
         buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
         4,
     )
-    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
-    assert last_id == device.device_id
+    assert last_location == device.device_id
 
     buffer.close()
 
 
-def test_managed_buffer_operations_support_external_managed_allocations(init_cuda):
+def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda):
     device = Device()
-    skip_if_managed_memory_unsupported(device)
+    _skip_if_managed_discard_prefetch_unsupported(device)
     device.set_current()
 
     buffer = DummyUnifiedMemoryResource(device).allocate(4096)
     stream = device.create_stream()
 
-    buffer.prefetch(device, stream=stream)
+    buffer.prefetch(-1, stream=stream)
     stream.sync()
 
-    last_type = _get_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE,
-        4,
-    )
-    last_id = _get_mem_range_attr(
+    buffer.discard_prefetch(device, stream=stream)
+    stream.sync()
+
+    last_location = _get_mem_range_attr(
         buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
         4,
     )
-    assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE)
-    assert last_id == device.device_id
+    assert last_location == device.device_id
 
     buffer.close()
 

From 9ab3f465d1c7d072a6dd9c6b8b70a9b47a24f3d8 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 09:34:29 -0700
Subject: [PATCH 05/31] Support cuda.bindings 12.x signatures in Buffer.advise and Buffer.prefetch

---
 cuda_core/cuda/core/_memory/_buffer.pyx | 40 ++++++++++++++++++--
 cuda_core/tests/test_memory.py          | 50 ++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 05a1667b3f..4460de900d 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -36,7 +36,7 @@ else:
     BufferProtocol = object
 
 from cuda.core._dlpack import DLDeviceType, make_py_capsule
-from cuda.core._utils.cuda_utils import driver, handle_return
+from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
 from cuda.core._device import Device
 
 
@@ -247,6 +247,20 @@ cdef inline object _normalize_managed_location(
     return _make_managed_location(<str>loc_type, loc_id)
 
 
+cdef inline bint _managed_location_uses_v2_bindings():
+    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
+    return get_binding_version() >= (13, 0)
+
+
+cdef inline int _managed_location_to_legacy_device(object location, str what):
+    """Return ``location.id`` for "device"/"host" locations; raise ``RuntimeError`` otherwise."""
+    cdef object kind = location.type
+    # The legacy call path passes a plain integer id, so only these two kinds can be expressed.
+    if kind != _managed_location_enum("device") and kind != _managed_location_enum("host"):
+        raise RuntimeError(f"{what} requires cuda.bindings 13.x for location_type={kind!r}")
+    return <int>location.id
+
+
 cdef inline void _require_managed_buffer(Buffer self, str what):
     _init_mem_attrs(self)
     if not self._mem_attrs.is_managed:
@@ -518,7 +532,17 @@ cdef class Buffer:
             allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
             allow_host_numa_current=advice_name == "set_preferred_location",
         )
-        handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location))
+        if _managed_location_uses_v2_bindings():
+            handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location))
+        else:
+            handle_return(
+                driver.cuMemAdvise(
+                    self.handle,
+                    self._size,
+                    advice,
+                    _managed_location_to_legacy_device(location, "Buffer.advise"),
+                )
+            )
 
     def prefetch(
         self,
@@ -539,7 +563,17 @@ cdef class Buffer:
             allow_host_numa=True,
             allow_host_numa_current=True,
         )
-        handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle))
+        if _managed_location_uses_v2_bindings():
+            handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle))
+        else:
+            handle_return(
+                driver.cuMemPrefetchAsync(
+                    self.handle,
+                    self._size,
+                    _managed_location_to_legacy_device(location, "Buffer.prefetch"),
+                    s.handle,
+                )
+            )
 
     def discard_prefetch(
         self,
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 95c6e6e964..380b581e7b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -43,7 +43,7 @@
     system as ccx_system,
 )
 from cuda.core._dlpack import DLDeviceType
-from cuda.core._memory import IPCBufferDescriptor
+from cuda.core._memory import IPCBufferDescriptor, _buffer
 from cuda.core._utils.cuda_utils import CUDAError, handle_return
 from cuda.core.utils import StridedMemoryView
 
@@ -1270,6 +1270,54 @@ def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(i
     buffer.close()
 
 
+def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda):
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    calls = []
+
+    def fake_cuMemAdvise(ptr, size, advice, location):
+        calls.append((ptr, size, advice, location))
+        return (driver.CUresult.CUDA_SUCCESS,)
+
+    monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9))
+    monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise)
+
+    buffer.advise("set_read_mostly")
+
+    assert len(calls) == 1
+    assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1))
+
+    buffer.close()
+
+
+def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda):
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    stream = device.create_stream()
+    calls = []
+
+    def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
+        calls.append((ptr, size, location, hstream))
+        return (driver.CUresult.CUDA_SUCCESS,)
+
+    monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9))
+    monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
+
+    buffer.prefetch(device, stream=stream)
+
+    assert len(calls) == 1
+    assert calls[0][2] == device.device_id
+    assert int(calls[0][3]) == int(stream.handle)
+
+    buffer.close()
+
+
 def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda):
     device = Device()
     device.set_current()

From a948066ab2fc6fda3dfb74516538091e96e68746 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 16:45:51 -0700
Subject: [PATCH 06/31] Move managed-memory operations from Buffer methods to
 free-standing functions in the cuda.core.managed_memory namespace

---
 cuda_core/cuda/core/__init__.py               |   2 +-
 cuda_core/cuda/core/_memory/_buffer.pyx       | 322 +++++++++++-------
 cuda_core/cuda/core/experimental/__init__.py  |   3 +-
 cuda_core/cuda/core/managed_memory.py         |   9 +
 cuda_core/docs/source/api.rst                 |  13 +
 cuda_core/docs/source/release/0.7.x-notes.rst |  10 +-
 cuda_core/pixi.lock                           |  18 +-
 .../test_experimental_backward_compat.py      |   7 +
 cuda_core/tests/test_memory.py                | 137 +++++---
 9 files changed, 335 insertions(+), 186 deletions(-)
 create mode 100644 cuda_core/cuda/core/managed_memory.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 139078e86e..c55c0786ed 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -28,7 +28,7 @@
 finally:
     del bindings, importlib, subdir, cuda_major, cuda_minor
 
-from cuda.core import system, utils
+from cuda.core import managed_memory, system, utils
 from cuda.core._device import Device
 from cuda.core._event import Event, EventOptions
 from cuda.core._graph import (
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 8ae6d22ee5..4663302b34 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -113,6 +113,13 @@ cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset((
     "unset_accessed_by",
 ))
 
+cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
+cdef int _HOST_NUMA_CURRENT_ID = 0
+cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
+cdef size_t _SINGLE_RANGE_COUNT = 1
+cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
+cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
+
 
 cdef inline object _managed_location_enum(str location_type):
     cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
@@ -130,7 +137,7 @@ cdef inline object _make_managed_location(str location_type, int location_id):
     if location_type == "host":
         location.id = int(getattr(driver, "CU_DEVICE_CPU", -1))
     elif location_type == "host_numa_current":
-        location.id = 0
+        location.id = _HOST_NUMA_CURRENT_ID
     else:
         location.id = location_id
     return location
@@ -236,7 +243,7 @@ cdef inline object _normalize_managed_location(
             raise ValueError(
                 f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
             )
-        return _make_managed_location(loc_type, 0)
+        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
 
     if loc_type == "host" and not allow_host:
         raise ValueError(f"{what} does not support host locations")
@@ -264,16 +271,206 @@ cdef inline int _managed_location_to_legacy_device(object location, str what):
 cdef inline void _require_managed_buffer(Buffer self, str what):
     _init_mem_attrs(self)
     if not self._mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory buffer")
+        raise ValueError(f"{what} requires a managed-memory allocation")
 
 
-cdef inline void _require_managed_discard_prefetch_support():
+cdef inline void _require_managed_discard_prefetch_support(str what):
     if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
         raise RuntimeError(
-            "Buffer.discard_prefetch requires cuda.bindings support for "
-            "cuMemDiscardAndPrefetchBatchAsync"
+            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
         )
 
+
+cdef inline tuple _managed_range_from_buffer(
+    Buffer buffer,
+    int size,
+    str what,
+):
+    if size != _MANAGED_SIZE_NOT_PROVIDED:
+        raise TypeError(f"{what} does not accept size= when target is a Buffer")
+    _require_managed_buffer(buffer, what)
+    return buffer.handle, buffer._size
+
+
+cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0:
+    cdef object ptr_obj
+    try:
+        ptr_obj = int(target)
+    except Exception as exc:
+        raise TypeError(
+            f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}"
+        ) from exc
+    if ptr_obj < 0:
+        raise ValueError(f"{what} target pointer must be >= 0, got {target!r}")
+    return <uintptr_t>ptr_obj
+
+
+cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1:
+    cdef _MemAttrs mem_attrs
+    with nogil:
+        _query_memory_attrs(mem_attrs, <cydriver.CUdeviceptr>ptr)
+    if not mem_attrs.is_managed:
+        raise ValueError(f"{what} requires a managed-memory allocation")
+    return 0
+
+
+cdef inline tuple _normalize_managed_target_range(
+    object target,
+    int size,
+    str what,
+):
+    cdef uintptr_t ptr
+
+    if isinstance(target, Buffer):
+        return _managed_range_from_buffer(<Buffer>target, size, what)
+
+    if size == _MANAGED_SIZE_NOT_PROVIDED:
+        raise TypeError(f"{what} requires size= when target is a raw pointer")
+    ptr = _coerce_raw_pointer(target, what)
+    _require_managed_pointer(ptr, what)
+    return ptr, <size_t>size
+
+
+def advise(
+    target,
+    advice: driver.CUmem_advise | str,
+    location: Device | int | None = None,
+    *,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Apply managed-memory advice to an allocation range.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    advice : :obj:`~driver.CUmem_advise` | str
+        Managed-memory advice to apply. String aliases such as
+        ``"set_read_mostly"``, ``"set_preferred_location"``, and
+        ``"set_accessed_by"`` are accepted.
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None`` for
+        advice values that ignore location.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
+    cdef str advice_name
+    cdef object ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "advise")
+    advice_name, advice = _normalize_managed_advice(advice)
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "advise",
+        allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
+        allow_host=True,
+        allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
+        allow_host_numa_current=advice_name == "set_preferred_location",
+    )
+    if _managed_location_uses_v2_bindings():
+        handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
+    else:
+        handle_return(
+            driver.cuMemAdvise(
+                ptr,
+                nbytes,
+                advice,
+                _managed_location_to_legacy_device(location, "advise"),
+            )
+        )
+
+
+def prefetch(
+    target,
+    location: Device | int | None = None,
+    *,
+    stream: Stream | GraphBuilder,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Prefetch a managed-memory allocation range to a target location."""
+    cdef Stream s = Stream_accept(stream)
+    cdef object ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch")
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "prefetch",
+        allow_none=False,
+        allow_host=True,
+        allow_host_numa=True,
+        allow_host_numa_current=True,
+    )
+    if _managed_location_uses_v2_bindings():
+        handle_return(
+            driver.cuMemPrefetchAsync(
+                ptr,
+                nbytes,
+                location,
+                _MANAGED_OPERATION_FLAGS,
+                s.handle,
+            )
+        )
+    else:
+        handle_return(
+            driver.cuMemPrefetchAsync(
+                ptr,
+                nbytes,
+                _managed_location_to_legacy_device(location, "prefetch"),
+                s.handle,
+            )
+        )
+
+
+def discard_prefetch(
+    target,
+    location: Device | int | None = None,
+    *,
+    stream: Stream | GraphBuilder,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Discard a managed-memory allocation range and prefetch it to a target location."""
+    cdef Stream s = Stream_accept(stream)
+    cdef object ptr
+    cdef object batch_ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
+    batch_ptr = driver.CUdeviceptr(int(ptr))
+    _require_managed_discard_prefetch_support("discard_prefetch")
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "discard_prefetch",
+        allow_none=False,
+        allow_host=True,
+        allow_host_numa=True,
+        allow_host_numa_current=True,
+    )
+    handle_return(
+        driver.cuMemDiscardAndPrefetchBatchAsync(
+            [batch_ptr],
+            [nbytes],
+            _SINGLE_RANGE_COUNT,
+            [location],
+            [_FIRST_PREFETCH_LOCATION_INDEX],
+            _SINGLE_PREFETCH_LOCATION_COUNT,
+            _MANAGED_OPERATION_FLAGS,
+            s.handle,
+        )
+    )
+
 cdef class Buffer:
     """Represent a handle to allocated memory.
 
@@ -502,119 +699,6 @@ cdef class Buffer:
         finally:
             PyBuffer_Release(&buf)
 
-    def advise(
-        self,
-        advice: driver.CUmem_advise | str,
-        location: Device | int | None = None,
-        *,
-        location_type: str | None = None,
-    ):
-        """Apply a managed-memory advice to this buffer.
-
-        This method is only valid for buffers backed by managed memory.
-
-        Parameters
-        ----------
-        advice : :obj:`~driver.CUmem_advise` | str
-            Managed-memory advice to apply. String aliases such as
-            ``"set_read_mostly"``, ``"set_preferred_location"``, and
-            ``"set_accessed_by"`` are accepted.
-        location : :obj:`~_device.Device` | int | None, optional
-            Target location. When ``location_type`` is ``None``, values are
-            interpreted as a device ordinal, ``-1`` for host, or ``None`` for
-            advice values that ignore location.
-        location_type : str | None, optional
-            Explicit location kind. Supported values are ``"device"``,
-            ``"host"``, ``"host_numa"``, and ``"host_numa_current"``.
-        """
-        cdef str advice_name
-        _require_managed_buffer(self, "Buffer.advise")
-        advice_name, advice = _normalize_managed_advice(advice)
-        location = _normalize_managed_location(
-            location,
-            location_type,
-            "Buffer.advise",
-            allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-            allow_host=True,
-            allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
-            allow_host_numa_current=advice_name == "set_preferred_location",
-        )
-        if _managed_location_uses_v2_bindings():
-            handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location))
-        else:
-            handle_return(
-                driver.cuMemAdvise(
-                    self.handle,
-                    self._size,
-                    advice,
-                    _managed_location_to_legacy_device(location, "Buffer.advise"),
-                )
-            )
-
-    def prefetch(
-        self,
-        location: Device | int | None = None,
-        *,
-        stream: Stream | GraphBuilder,
-        location_type: str | None = None,
-    ):
-        """Prefetch this managed-memory buffer to a target location."""
-        cdef Stream s = Stream_accept(stream)
-        _require_managed_buffer(self, "Buffer.prefetch")
-        location = _normalize_managed_location(
-            location,
-            location_type,
-            "Buffer.prefetch",
-            allow_none=False,
-            allow_host=True,
-            allow_host_numa=True,
-            allow_host_numa_current=True,
-        )
-        if _managed_location_uses_v2_bindings():
-            handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle))
-        else:
-            handle_return(
-                driver.cuMemPrefetchAsync(
-                    self.handle,
-                    self._size,
-                    _managed_location_to_legacy_device(location, "Buffer.prefetch"),
-                    s.handle,
-                )
-            )
-
-    def discard_prefetch(
-        self,
-        location: Device | int | None = None,
-        *,
-        stream: Stream | GraphBuilder,
-        location_type: str | None = None,
-    ):
-        """Discard this managed-memory buffer and prefetch it to a target location."""
-        cdef Stream s = Stream_accept(stream)
-        _require_managed_buffer(self, "Buffer.discard_prefetch")
-        _require_managed_discard_prefetch_support()
-        location = _normalize_managed_location(
-            location,
-            location_type,
-            "Buffer.discard_prefetch",
-            allow_none=False,
-            allow_host=True,
-            allow_host_numa=True,
-            allow_host_numa_current=True,
-        )
-        handle_return(
-            driver.cuMemDiscardAndPrefetchBatchAsync(
-                [self.handle],
-                [self._size],
-                1,
-                [location],
-                [0],
-                1,
-                0,
-                s.handle,
-            )
-        )
-
     def __dlpack__(
         self,
         *,
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index e7989f0f26..83fb1c7581 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -38,9 +38,10 @@ def _warn_deprecated():
 _warn_deprecated()
 
 
-from cuda.core import system, utils
+from cuda.core import managed_memory, system, utils
 
 # Make utils accessible as a submodule for backward compatibility
+__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory
 __import__("sys").modules[__spec__.name + ".utils"] = utils
 
 
diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
new file mode 100644
index 0000000000..f11aabcd19
--- /dev/null
+++ b/cuda_core/cuda/core/managed_memory.py
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Managed-memory range operations."""
+
+from cuda.core._memory._buffer import advise, discard_prefetch, prefetch
+
+__all__ = ["advise", "prefetch", "discard_prefetch"]
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index fa7ce48eb5..4d63bbcf88 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -62,6 +62,19 @@ CUDA runtime
    on other non-blocking streams.
 
 
+.. module:: cuda.core.managed_memory
+
+Managed memory
+--------------
+
+.. autosummary::
+   :toctree: generated/
+
+   advise
+   prefetch
+   discard_prefetch
+
+
 CUDA compilation toolchain
 --------------------------
 
diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst
index 18b3bede36..186e3181f1 100644
--- a/cuda_core/docs/source/release/0.7.x-notes.rst
+++ b/cuda_core/docs/source/release/0.7.x-notes.rst
@@ -35,10 +35,12 @@ New features
   preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or
   ``("host_numa", 3)``.
 
-- Added managed-memory controls on :class:`Buffer`: ``advise()``,
-  ``prefetch()``, and ``discard_prefetch()``. These methods validate that the
-  underlying allocation is managed memory and then forward to the corresponding
-  CUDA driver operations for range advice and migration.
+- Added managed-memory range operations under :mod:`cuda.core.managed_memory`:
+  ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free
+  functions accept either a managed :class:`Buffer` or a raw pointer plus
+  ``size=``, validate that the target allocation is managed memory, and then
+  forward to the corresponding CUDA driver operations for range advice and
+  migration.
 
 - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit
   control over host NUMA node placement. When ``ipc_enabled=True`` and
diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock
index 78da9addb5..e2f8b7b0c2 100644
--- a/cuda_core/pixi.lock
+++ b/cuda_core/pixi.lock
@@ -2598,7 +2598,7 @@ packages:
   subdir: win-64
   variants:
     c_compiler: vs2022
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     cxx_compiler: vs2022
     python: 3.14.*
     target_platform: win-64
@@ -2625,7 +2625,7 @@ packages:
   build: py314h9a28ecd_0
   subdir: linux-aarch64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-aarch64
   depends:
@@ -2653,7 +2653,7 @@ packages:
   build: py314hb727236_0
   subdir: linux-64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-64
   depends:
@@ -2794,7 +2794,7 @@ packages:
   subdir: win-64
   variants:
     c_compiler: vs2022
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     cxx_compiler: vs2022
     python: 3.14.*
     target_platform: win-64
@@ -2817,7 +2817,7 @@ packages:
   subdir: win-64
   variants:
     c_compiler: vs2022
-    cuda-version: 12.*
+    cuda_version: 12.*
     cxx_compiler: vs2022
     python: 3.14.*
     target_platform: win-64
@@ -2840,7 +2840,7 @@ packages:
   build: py314h9a28ecd_0
   subdir: linux-aarch64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-aarch64
   depends:
@@ -2862,7 +2862,7 @@ packages:
   build: py314ha6d028f_0
   subdir: linux-64
   variants:
-    cuda-version: 12.*
+    cuda_version: 12.*
     python: 3.14.*
     target_platform: linux-64
   depends:
@@ -2884,7 +2884,7 @@ packages:
   build: py314hb727236_0
   subdir: linux-64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-64
   depends:
@@ -2906,7 +2906,7 @@ packages:
   build: py314he8946ed_0
   subdir: linux-aarch64
   variants:
-    cuda-version: 12.*
+    cuda_version: 12.*
     python: 3.14.*
     target_platform: linux-aarch64
   depends:
diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py
index c3215b056a..82e2cdd5be 100644
--- a/cuda_core/tests/test_experimental_backward_compat.py
+++ b/cuda_core/tests/test_experimental_backward_compat.py
@@ -38,6 +38,7 @@ def test_experimental_backward_compatibility():
     assert hasattr(cuda.core.experimental, "Device")
     assert hasattr(cuda.core.experimental, "Stream")
     assert hasattr(cuda.core.experimental, "Buffer")
+    assert hasattr(cuda.core.experimental, "managed_memory")
     assert hasattr(cuda.core.experimental, "system")
 
     # Test 2: Direct imports - should emit deprecation warning
@@ -73,6 +74,7 @@ def test_experimental_backward_compatibility():
     assert cuda.core.experimental.Linker is cuda.core.Linker
 
     # Compare singletons
+    assert cuda.core.experimental.managed_memory is cuda.core.managed_memory
     assert cuda.core.experimental.system is cuda.core.system
 
     # Test 4: Utils module works
@@ -88,6 +90,11 @@ def test_experimental_backward_compatibility():
 
     assert StridedMemoryView is not None
     assert args_viewable_as_strided_memory is not None
+    from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch
+
+    assert advise is not None
+    assert prefetch is not None
+    assert discard_prefetch is not None
 
     # Test 5: Options classes are accessible
     assert hasattr(cuda.core.experimental, "EventOptions")
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 380b581e7b..927014826a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -38,6 +38,7 @@
     PinnedMemoryResourceOptions,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
+    managed_memory,
 )
 from cuda.core import (
     system as ccx_system,
@@ -48,6 +49,12 @@
 from cuda.core.utils import StridedMemoryView
 
 POOL_SIZE = 2097152  # 2MB size
+_MANAGED_TEST_ALLOCATION_SIZE = 4096
+_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4
+_READ_MOSTLY_ENABLED = 1
+_HOST_LOCATION_ID = -1
+_INVALID_HOST_DEVICE_ORDINAL = 0
+_LEGACY_BINDINGS_VERSION = (12, 9)
 
 
 class DummyDeviceMemoryResource(MemoryResource):
@@ -1138,6 +1145,10 @@ def _get_mem_range_attr(buffer, attribute, data_size):
     return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
 
 
+def _get_int_mem_range_attr(buffer, attribute):
+    return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE)
+
+
 def _skip_if_managed_allocation_unsupported(device):
     try:
         if not device.properties.managed_memory:
@@ -1165,140 +1176,134 @@ def _skip_if_managed_discard_prefetch_unsupported(device):
         pytest.skip("discard-prefetch requires concurrent managed access on all visible devices")
 
 
-def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda):
+def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
     device = Device()
     skip_if_managed_memory_unsupported(device)
     device.set_current()
 
     mr = create_managed_memory_resource_or_skip()
-    buffer = mr.allocate(4096)
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    buffer.prefetch(-1, stream=stream)
+    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
     stream.sync()
-    last_location = _get_mem_range_attr(
+    last_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
-        4,
     )
-    assert last_location == -1
+    assert last_location == _HOST_LOCATION_ID
 
-    buffer.prefetch(device, stream=stream)
+    managed_memory.prefetch(buffer, device, stream=stream)
     stream.sync()
-    last_location = _get_mem_range_attr(
+    last_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
-        4,
     )
     assert last_location == device.device_id
 
     buffer.close()
 
 
-def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda):
+def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
     device = Device()
     _skip_if_managed_allocation_unsupported(device)
     device.set_current()
 
-    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
 
-    buffer.advise("set_read_mostly")
+    managed_memory.advise(buffer, "set_read_mostly")
     assert (
-        _get_mem_range_attr(
+        _get_int_mem_range_attr(
             buffer,
             driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
-            4,
         )
-        == 1
+        == _READ_MOSTLY_ENABLED
     )
 
     # cuda.bindings currently exposes the combined location attributes for
     # cuMemRangeGetAttribute, so use the legacy location query here.
-    buffer.advise("set_preferred_location", location_type="host")
-    preferred_location = _get_mem_range_attr(
+    managed_memory.advise(buffer, "set_preferred_location", location_type="host")
+    preferred_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
-        4,
     )
-    assert preferred_location == -1
+    assert preferred_location == _HOST_LOCATION_ID
 
     buffer.close()
 
 
-def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda):
+def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda):
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
 
-    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    buffer.prefetch(device, stream=stream)
+    managed_memory.prefetch(buffer, device, stream=stream)
     stream.sync()
 
-    last_location = _get_mem_range_attr(
+    last_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
-        4,
     )
     assert last_location == device.device_id
 
     buffer.close()
 
 
-def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda):
+def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda):
     device = Device()
     _skip_if_managed_discard_prefetch_unsupported(device)
     device.set_current()
 
-    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    buffer.prefetch(-1, stream=stream)
+    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
     stream.sync()
 
-    buffer.discard_prefetch(device, stream=stream)
+    managed_memory.discard_prefetch(buffer, device, stream=stream)
     stream.sync()
 
-    last_location = _get_mem_range_attr(
+    last_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
-        4,
     )
     assert last_location == device.device_id
 
     buffer.close()
 
 
-def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda):
+def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda):
     device = Device()
     _skip_if_managed_allocation_unsupported(device)
     device.set_current()
 
-    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     calls = []
 
     def fake_cuMemAdvise(ptr, size, advice, location):
         calls.append((ptr, size, advice, location))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9))
+    monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
     monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise)
 
-    buffer.advise("set_read_mostly")
+    managed_memory.advise(buffer, "set_read_mostly")
 
     assert len(calls) == 1
-    assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1))
+    assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID))
 
     buffer.close()
 
 
-def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda):
+def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda):
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
 
-    buffer = DummyUnifiedMemoryResource(device).allocate(4096)
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
     calls = []
 
@@ -1306,10 +1311,10 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
         calls.append((ptr, size, location, hstream))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9))
+    monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
     monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
 
-    buffer.prefetch(device, stream=stream)
+    managed_memory.prefetch(buffer, device, stream=stream)
 
     assert len(calls) == 1
     assert calls[0][2] == device.device_id
@@ -1318,38 +1323,66 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
     buffer.close()
 
 
-def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda):
+def test_managed_memory_operations_reject_non_managed_allocations(init_cuda):
     device = Device()
     device.set_current()
 
-    buffer = DummyDeviceMemoryResource(device).allocate(4096)
+    buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    with pytest.raises(ValueError, match="managed-memory buffer"):
-        buffer.advise("set_read_mostly")
-    with pytest.raises(ValueError, match="managed-memory buffer"):
-        buffer.prefetch(device, stream=stream)
-    with pytest.raises(ValueError, match="managed-memory buffer"):
-        buffer.discard_prefetch(device, stream=stream)
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        managed_memory.advise(buffer, "set_read_mostly")
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        managed_memory.prefetch(buffer, device, stream=stream)
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        managed_memory.discard_prefetch(buffer, device, stream=stream)
 
     buffer.close()
 
 
-def test_managed_buffer_operation_validation(init_cuda):
+def test_managed_memory_operation_validation(init_cuda):
     device = Device()
     skip_if_managed_memory_unsupported(device)
     device.set_current()
 
     mr = create_managed_memory_resource_or_skip()
-    buffer = mr.allocate(4096)
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
     with pytest.raises(ValueError, match="requires a location"):
-        buffer.prefetch(stream=stream)
+        managed_memory.prefetch(buffer, stream=stream)
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
-        buffer.advise("set_accessed_by", 0, location_type="host_numa")
+        managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa")
     with pytest.raises(ValueError, match="location must be None or -1"):
-        buffer.prefetch(0, stream=stream, location_type="host")
+        managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host")
+
+    buffer.close()
+
+
+def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda):
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+
+    managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size)
+    assert (
+        _get_int_mem_range_attr(
+            buffer,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        )
+        == _READ_MOSTLY_ENABLED
+    )
+
+    managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream)
+    stream.sync()
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
 
     buffer.close()
 

From 14575991d65ca85973a4f1dc61f068efc4fc3293 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 16:46:20 -0700
Subject: [PATCH 07/31] precommit format

---
 cuda_core/cuda/core/managed_memory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
index f11aabcd19..f5bb09c13d 100644
--- a/cuda_core/cuda/core/managed_memory.py
+++ b/cuda_core/cuda/core/managed_memory.py
@@ -6,4 +6,4 @@
 
 from cuda.core._memory._buffer import advise, discard_prefetch, prefetch
 
-__all__ = ["advise", "prefetch", "discard_prefetch"]
+__all__ = ["advise", "discard_prefetch", "prefetch"]

From acb402478cac58689f069e0836819b2e91010c09 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 17:30:41 -0700
Subject: [PATCH 08/31] iterating on implementation

---
 cuda_bindings/pixi.lock                 | 86 ++++++++++++-------------
 cuda_core/cuda/core/_memory/_buffer.pyx | 63 ++++++++++++++----
 cuda_core/tests/test_memory.py          | 85 ++++++++++++++++++++++++
 3 files changed, 178 insertions(+), 56 deletions(-)

diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock
index b01d6eec69..237a169580 100644
--- a/cuda_bindings/pixi.lock
+++ b/cuda_bindings/pixi.lock
@@ -1081,21 +1081,21 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_16.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.2.20-h7938cbb_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-12.9.27-ha770c72_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-12.9.79-h5888daf_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-12.9.79-h3f2d84a_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-12.9.79-h5888daf_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-12.9.79-h3f2d84a_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-12.9.86-h69a702a_6.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.3-py314h1807b08_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda
@@ -1134,7 +1134,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda
@@ -1160,8 +1160,8 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-12.9.82-hecca717_1.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.9.86-hecca717_2.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2025.2.0-hb617929_1.conda
@@ -1264,7 +1264,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda
       - conda: .
-        build: py314hb727236_0
+        build: py314ha6d028f_0
       - conda: ../cuda_pathfinder
       linux-aarch64:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2
@@ -1460,21 +1460,21 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/conda-gcc-specs-15.2.0-hd546029_16.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-12.9.86-h57928b3_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-12.9.79-he0c23c2_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-12.9.86-hac47afa_1.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-12.9.86-h719f0c7_6.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda
-      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-13.2.27-h57928b3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-13.2.51-h57928b3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.2.51-h719f0c7_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.2.51-h57928b3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.2.51-h2466b09_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.2.51-h2466b09_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.2.20-h57928b3_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.3-py314h344ed54_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda
@@ -1520,8 +1520,8 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_2.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-h2466b09_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-12.9.82-hac47afa_1.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-12.9.86-hac47afa_2.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.2.51-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.2.51-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6-h6a83c73_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.53-h7351971_0.conda
@@ -1583,7 +1583,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda
       - conda: .
-        build: py314h5e6f764_0
+        build: py314h356c398_0
       - conda: ../cuda_pathfinder
 packages:
 - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -2154,7 +2154,7 @@ packages:
   subdir: win-64
   variants:
     c_compiler: vs2022
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     cxx_compiler: vs2022
     python: 3.14.*
     target_platform: win-64
@@ -2182,7 +2182,7 @@ packages:
   subdir: win-64
   variants:
     c_compiler: vs2022
-    cuda-version: 12.*
+    cuda_version: 12.*
     cxx_compiler: vs2022
     python: 3.14.*
     target_platform: win-64
@@ -2209,7 +2209,7 @@ packages:
   build: py314h9a28ecd_0
   subdir: linux-aarch64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-aarch64
   depends:
@@ -2237,7 +2237,7 @@ packages:
   build: py314ha6d028f_0
   subdir: linux-64
   variants:
-    cuda-version: 12.*
+    cuda_version: 12.*
     python: 3.14.*
     target_platform: linux-64
   depends:
@@ -2265,7 +2265,7 @@ packages:
   build: py314hb727236_0
   subdir: linux-64
   variants:
-    cuda-version: 13.2.*
+    cuda_version: 13.2.*
     python: 3.14.*
     target_platform: linux-64
   depends:
@@ -2293,7 +2293,7 @@ packages:
   build: py314he8946ed_0
   subdir: linux-aarch64
   variants:
-    cuda-version: 12.*
+    cuda_version: 12.*
     python: 3.14.*
     target_platform: linux-aarch64
   depends:
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 4663302b34..829e05b3ad 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -205,9 +205,11 @@ cdef inline object _normalize_managed_location(
             )
         loc_id = <int>location
         if loc_id == -1:
-            loc_type = "host"
+            if not allow_host:
+                raise ValueError(f"{what} does not support host locations")
+            return _make_managed_location("host", -1)
         elif loc_id >= 0:
-            loc_type = "device"
+            return _make_managed_location("device", loc_id)
         else:
             raise ValueError(
                 f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
@@ -245,23 +247,22 @@ cdef inline object _normalize_managed_location(
             )
         return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
 
-    if loc_type == "host" and not allow_host:
-        raise ValueError(f"{what} does not support host locations")
-    if loc_type == "host_numa" and not allow_host_numa:
-        raise ValueError(f"{what} does not support location_type='host_numa'")
-    if loc_type == "host_numa_current" and not allow_host_numa_current:
-        raise ValueError(f"{what} does not support location_type='host_numa_current'")
-    return _make_managed_location(<str>loc_type, loc_id)
-
 
 cdef inline bint _managed_location_uses_v2_bindings():
     # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
     return get_binding_version() >= (13, 0)
 
 
+cdef object _LEGACY_LOC_DEVICE = None
+cdef object _LEGACY_LOC_HOST = None
+
 cdef inline int _managed_location_to_legacy_device(object location, str what):
+    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
+    if _LEGACY_LOC_DEVICE is None:
+        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
+        _LEGACY_LOC_HOST = _managed_location_enum("host")
     cdef object loc_type = location.type
-    if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"):
+    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
         return <int>location.id
     raise RuntimeError(
         f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
@@ -396,7 +397,25 @@ def prefetch(
     int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
-    """Prefetch a managed-memory allocation range to a target location."""
+    """Prefetch a managed-memory allocation range to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, an ``int`` is
+        interpreted as a device ordinal (``>= 0``) or as host (``-1``).
+        A location is required for prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous prefetch.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
     cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef size_t nbytes
@@ -440,7 +459,25 @@ def discard_prefetch(
     int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
-    """Discard a managed-memory allocation range and prefetch it to a target location."""
+    """Discard a managed-memory allocation range and prefetch it to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, an ``int`` is
+        interpreted as a device ordinal (``>= 0``) or as host (``-1``).
+        A location is required for discard_prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous operation.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
     cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef object batch_ptr
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 927014826a..ea827818ac 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1359,6 +1359,91 @@ def test_managed_memory_operation_validation(init_cuda):
     buffer.close()
 
 
+def test_managed_memory_advise_location_validation(init_cuda):
+    """Verify doc-specified location constraints for each advice kind."""
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+
+    # set_read_mostly works without a location (location is ignored)
+    managed_memory.advise(buffer, "set_read_mostly")
+
+    # set_preferred_location requires a location; device ordinal works
+    managed_memory.advise(buffer, "set_preferred_location", device.device_id)
+
+    # set_preferred_location with host location_type
+    managed_memory.advise(buffer, "set_preferred_location", location_type="host")
+
+    # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs)
+    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+        managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa")
+
+    # set_accessed_by with host_numa_current also raises ValueError
+    with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"):
+        managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current")
+
+    # Inferred location from int: -1 maps to host, 0 maps to device
+    managed_memory.advise(buffer, "set_preferred_location", -1)
+    managed_memory.advise(buffer, "set_preferred_location", 0)
+
+    buffer.close()
+
+
+def test_managed_memory_advise_accepts_enum_value(init_cuda):
+    """advise() accepts CUmem_advise enum values directly, not just string aliases."""
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+
+    advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY
+    managed_memory.advise(buffer, advice_enum)
+
+    assert (
+        _get_int_mem_range_attr(
+            buffer,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        )
+        == _READ_MOSTLY_ENABLED
+    )
+
+    buffer.close()
+
+
+def test_managed_memory_advise_size_rejected_for_buffer(init_cuda):
+    """advise() raises TypeError when size= is given with a Buffer target."""
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+
+    with pytest.raises(TypeError, match="does not accept size="):
+        managed_memory.advise(buffer, "set_read_mostly", size=1024)
+
+    buffer.close()
+
+
+def test_managed_memory_advise_invalid_advice_values(init_cuda):
+    """advise() rejects invalid advice strings and wrong types."""
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+
+    with pytest.raises(ValueError, match="advice must be one of"):
+        managed_memory.advise(buffer, "not_a_real_advice")
+
+    with pytest.raises(TypeError, match="advice must be"):
+        managed_memory.advise(buffer, 42)
+
+    buffer.close()
+
+
 def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda):
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)

From d10ab07e2f402628b83b08e07d95da39c4f2b634 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 17 Mar 2026 18:13:36 -0700
Subject: [PATCH 09/31] Simplify managed-memory helpers: remove long-form
 aliases, cache lookups, fix docs

- Remove duplicate long-form "cu_mem_advise_*" string aliases from
  _MANAGED_ADVICE_ALIASES; users pass short strings or the enum directly
- Replace the 3 boolean allow_host* params in _normalize_managed_location with a
  single allowed_loctypes frozenset driven by _MANAGED_ADVICE_ALLOWED_LOCTYPES
- Cache immutable runtime checks: CU_DEVICE_CPU, v2 bindings flag,
  discard_prefetch support, and advice enum-to-alias reverse map
- Collapse hasattr+getattr to single getattr in _managed_location_enum
- Move _require_managed_discard_prefetch_support to top of discard_prefetch
  for fail-fast behavior
- Fix docs build: reset Sphinx module scope after managed_memory section in
  api.rst so subsequent sections resolve under cuda.core
- Add discard_prefetch pool-allocation test and comment on _get_mem_range_attr

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 94 ++++++++++++++-----------
 cuda_core/docs/source/api.rst           |  2 +
 cuda_core/tests/test_memory.py          | 26 +++++++
 3 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 829e05b3ad..d280b4ea2b 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -89,17 +89,11 @@ cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
 
 cdef dict _MANAGED_ADVICE_ALIASES = {
     "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
-    "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
     "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
-    "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
     "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
-    "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
     "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
-    "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
     "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
-    "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
     "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
-    "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
 }
 
 cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
@@ -108,10 +102,18 @@ cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
     "unset_preferred_location",
 ))
 
-cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset((
-    "set_accessed_by",
-    "unset_accessed_by",
-))
+cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
+cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
+cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
+
+cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
+    "set_read_mostly": _DEVICE_HOST_NUMA,
+    "unset_read_mostly": _DEVICE_HOST_NUMA,
+    "set_preferred_location": _ALL_LOCATION_TYPES,
+    "unset_preferred_location": _DEVICE_HOST_NUMA,
+    "set_accessed_by": _DEVICE_HOST_ONLY,
+    "unset_accessed_by": _DEVICE_HOST_ONLY,
+}
 
 cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
 cdef int _HOST_NUMA_CURRENT_ID = 0
@@ -120,22 +122,32 @@ cdef size_t _SINGLE_RANGE_COUNT = 1
 cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
 cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
 
+# Lazily cached values for immutable runtime properties.
+cdef object _CU_DEVICE_CPU = None
+cdef dict _ADVICE_ENUM_TO_ALIAS = None
+cdef int _V2_BINDINGS = -1
+cdef int _DISCARD_PREFETCH_SUPPORTED = -1
+
 
 cdef inline object _managed_location_enum(str location_type):
     cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
-    if not hasattr(driver.CUmemLocationType, attr_name):
+    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
+    if result is None:
         raise RuntimeError(
             f"Managed-memory location type {location_type!r} is not supported by the "
             f"installed cuda.bindings package."
         )
-    return getattr(driver.CUmemLocationType, attr_name)
+    return result
 
 
 cdef inline object _make_managed_location(str location_type, int location_id):
+    global _CU_DEVICE_CPU
     cdef object location = driver.CUmemLocation()
     location.type = _managed_location_enum(location_type)
     if location_type == "host":
-        location.id = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        if _CU_DEVICE_CPU is None:
+            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        location.id = _CU_DEVICE_CPU
     elif location_type == "host_numa_current":
         location.id = _HOST_NUMA_CURRENT_ID
     else:
@@ -157,12 +169,17 @@ cdef inline tuple _normalize_managed_advice(object advice):
         return alias, getattr(driver.CUmem_advise, attr_name)
 
     if isinstance(advice, driver.CUmem_advise):
-        for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
-            if alias.startswith("cu_mem_advise_"):
-                continue
-            if advice == getattr(driver.CUmem_advise, attr_name):
-                return alias, advice
-        raise ValueError(f"Unsupported advice value: {advice!r}")
+        global _ADVICE_ENUM_TO_ALIAS
+        if _ADVICE_ENUM_TO_ALIAS is None:
+            _ADVICE_ENUM_TO_ALIAS = {}
+            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+                enum_val = getattr(driver.CUmem_advise, attr_name, None)
+                if enum_val is not None:
+                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
+        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
+        if alias is None:
+            raise ValueError(f"Unsupported advice value: {advice!r}")
+        return alias, advice
 
     raise TypeError(
         "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
@@ -174,9 +191,7 @@ cdef inline object _normalize_managed_location(
     object location_type,
     str what,
     bint allow_none=False,
-    bint allow_host=True,
-    bint allow_host_numa=True,
-    bint allow_host_numa_current=True,
+    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
 ):
     cdef object loc_type
     cdef int loc_id
@@ -194,6 +209,9 @@ cdef inline object _normalize_managed_location(
             f"or None, got {location_type!r}"
         )
 
+    if loc_type is not None and loc_type not in allowed_loctypes:
+        raise ValueError(f"{what} does not support location_type='{loc_type}'")
+
     if loc_type is None:
         if location is None:
             if allow_none:
@@ -205,7 +223,7 @@ cdef inline object _normalize_managed_location(
             )
         loc_id = <int>location
         if loc_id == -1:
-            if not allow_host:
+            if "host" not in allowed_loctypes:
                 raise ValueError(f"{what} does not support host locations")
             return _make_managed_location("host", -1)
         elif loc_id >= 0:
@@ -227,20 +245,14 @@ cdef inline object _normalize_managed_location(
             raise ValueError(
                 f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
             )
-        if not allow_host:
-            raise ValueError(f"{what} does not support location_type='host'")
         return _make_managed_location(loc_type, -1)
     elif loc_type == "host_numa":
-        if not allow_host_numa:
-            raise ValueError(f"{what} does not support location_type='host_numa'")
         if not isinstance(location, int) or <int>location < 0:
             raise ValueError(
                 f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
             )
         return _make_managed_location(loc_type, <int>location)
     else:
-        if not allow_host_numa_current:
-            raise ValueError(f"{what} does not support location_type='host_numa_current'")
         if location is not None:
             raise ValueError(
                 f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
@@ -250,7 +262,10 @@ cdef inline object _normalize_managed_location(
 
 cdef inline bint _managed_location_uses_v2_bindings():
     # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
-    return get_binding_version() >= (13, 0)
+    global _V2_BINDINGS
+    if _V2_BINDINGS < 0:
+        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
+    return _V2_BINDINGS != 0
 
 
 cdef object _LEGACY_LOC_DEVICE = None
@@ -276,7 +291,10 @@ cdef inline void _require_managed_buffer(Buffer self, str what):
 
 
 cdef inline void _require_managed_discard_prefetch_support(str what):
-    if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+    global _DISCARD_PREFETCH_SUPPORTED
+    if _DISCARD_PREFETCH_SUPPORTED < 0:
+        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
+    if not _DISCARD_PREFETCH_SUPPORTED:
         raise RuntimeError(
             f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
         )
@@ -372,9 +390,7 @@ def advise(
         location_type,
         "advise",
         allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-        allow_host=True,
-        allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
-        allow_host_numa_current=advice_name == "set_preferred_location",
+        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
     )
     if _managed_location_uses_v2_bindings():
         handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
@@ -425,10 +441,6 @@ def prefetch(
         location,
         location_type,
         "prefetch",
-        allow_none=False,
-        allow_host=True,
-        allow_host_numa=True,
-        allow_host_numa_current=True,
     )
     if _managed_location_uses_v2_bindings():
         handle_return(
@@ -478,6 +490,7 @@ def discard_prefetch(
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
+    _require_managed_discard_prefetch_support("discard_prefetch")
     cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef object batch_ptr
@@ -485,15 +498,10 @@ def discard_prefetch(
 
     ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
     batch_ptr = driver.CUdeviceptr(int(ptr))
-    _require_managed_discard_prefetch_support("discard_prefetch")
     location = _normalize_managed_location(
         location,
         location_type,
         "discard_prefetch",
-        allow_none=False,
-        allow_host=True,
-        allow_host_numa=True,
-        allow_host_numa_current=True,
     )
     handle_return(
         driver.cuMemDiscardAndPrefetchBatchAsync(
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 4d63bbcf88..7bf59ae495 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -74,6 +74,8 @@ Managed memory
    prefetch
    discard_prefetch
 
+.. module:: cuda.core
+   :no-index:
 
 CUDA compilation toolchain
 --------------------------
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index ea827818ac..5296ea344a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1142,6 +1142,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
 
 
 def _get_mem_range_attr(buffer, attribute, data_size):
+    # cuMemRangeGetAttribute returns a raw integer when data_size <= 4.
     return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
 
 
@@ -1252,6 +1253,31 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda
     buffer.close()
 
 
+def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    _skip_if_managed_discard_prefetch_unsupported(device)
+    device.set_current()
+
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+
+    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    stream.sync()
+
+    managed_memory.discard_prefetch(buffer, device, stream=stream)
+    stream.sync()
+
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
 def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda):
     device = Device()
     _skip_if_managed_discard_prefetch_unsupported(device)

From c250c92e47393fa6cb0e6611245c5a4dd0c3b6cf Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 18 Mar 2026 09:21:11 -0700
Subject: [PATCH 10/31] fix(test): reset _V2_BINDINGS cache so legacy-signature
 tests take the legacy path

The _V2_BINDINGS cache in _buffer.pyx persists across tests, so
monkeypatching get_binding_version alone is insufficient when earlier
tests have already populated the cache with the v2 value. Promote
_V2_BINDINGS from cdef int to a Python-level variable so tests can
monkeypatch it directly via monkeypatch.setattr, and reset it to -1
(the "not yet computed" sentinel) in both legacy-signature tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 2 +-
 cuda_core/tests/test_memory.py          | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 6f5809e06c..d109de2ac4 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -124,7 +124,7 @@ cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
 # Lazily cached values for immutable runtime properties.
 cdef object _CU_DEVICE_CPU = None
 cdef dict _ADVICE_ENUM_TO_ALIAS = None
-cdef int _V2_BINDINGS = -1
+_V2_BINDINGS = -1
 cdef int _DISCARD_PREFETCH_SUPPORTED = -1
 
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 9cd3209d8d..411a3c6cb5 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1314,6 +1314,7 @@ def fake_cuMemAdvise(ptr, size, advice, location):
         return (driver.CUresult.CUDA_SUCCESS,)
 
     monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise)
 
     managed_memory.advise(buffer, "set_read_mostly")
@@ -1338,6 +1339,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
         return (driver.CUresult.CUDA_SUCCESS,)
 
     monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
 
     managed_memory.prefetch(buffer, device, stream=stream)

From 89329d9c6eff581445b4806fe0217e598a2313fa Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 18 Mar 2026 10:18:41 -0700
Subject: [PATCH 11/31] fix(test): require concurrent_managed_access for advise
 tests that hit real hardware

These three tests call cuMemAdvise on real CUDA devices and verify
memory range attributes. On devices without concurrent_managed_access
(e.g. Windows/WDDM), set_read_mostly silently no-ops and
set_preferred_location fails with CUDA_ERROR_INVALID_DEVICE. Use the
stricter _skip_if_managed_location_ops_unsupported guard, matching the
pattern already used by test_managed_memory_functions_accept_raw_pointer_ranges.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/tests/test_memory.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 411a3c6cb5..56c505fbe6 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1207,7 +1207,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
 
 def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
 
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -1390,7 +1390,7 @@ def test_managed_memory_operation_validation(init_cuda):
 def test_managed_memory_advise_location_validation(init_cuda):
     """Verify doc-specified location constraints for each advice kind."""
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
 
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -1422,7 +1422,7 @@ def test_managed_memory_advise_location_validation(init_cuda):
 def test_managed_memory_advise_accepts_enum_value(init_cuda):
     """advise() accepts CUmem_advise enum values directly, not just string aliases."""
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
 
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)

From 8a75d1bf1f1172e4681bb232a22f00ff9567d5d8 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 18 Mar 2026 11:23:53 -0700
Subject: [PATCH 12/31] fix: validate managed buffer before checking
 discard_prefetch bindings support

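Reorder checks in discard_prefetch so _normalize_managed_target_range
runs before _require_managed_discard_prefetch_support. This ensures
non-managed buffers raise ValueError before the RuntimeError for missing
cuMemDiscardAndPrefetchBatchAsync support. Stream_accept(stream) is
likewise moved after both checks, so the stream argument is only
processed once the target and bindings checks have passed.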

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index d109de2ac4..ffd82facb5 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -489,13 +489,13 @@ def discard_prefetch(
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
-    _require_managed_discard_prefetch_support("discard_prefetch")
-    cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef object batch_ptr
     cdef size_t nbytes
 
     ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
+    _require_managed_discard_prefetch_support("discard_prefetch")
+    cdef Stream s = Stream_accept(stream)
     batch_ptr = driver.CUdeviceptr(int(ptr))
     location = _normalize_managed_location(
         location,

From 9e9b1e0914d30f855389a349cf8d41d134b1c4dc Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 18 Mar 2026 14:08:24 -0700
Subject: [PATCH 13/31] refactor: extract managed memory ops into dedicated
 _managed_memory_ops module

Move advise, prefetch, and discard_prefetch functions and their helpers
out of _buffer.pyx into a new _managed_memory_ops Cython module to
improve separation of concerns. Expose _init_mem_attrs and
_query_memory_attrs as non-inline cdef functions in _buffer.pxd so the
new module can reuse them.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_buffer.pxd       |   8 +
 cuda_core/cuda/core/_memory/_buffer.pyx       | 449 +----------------
 .../cuda/core/_memory/_managed_memory_ops.pxd |   6 +
 .../cuda/core/_memory/_managed_memory_ops.pyx | 458 ++++++++++++++++++
 cuda_core/cuda/core/managed_memory.py         |   2 +-
 cuda_core/tests/test_memory.py                |  14 +-
 6 files changed, 483 insertions(+), 454 deletions(-)
 create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
 create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx

diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index 04b5707e18..9065da77eb 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -4,6 +4,7 @@
 
 from libc.stdint cimport uintptr_t
 
+from cuda.bindings cimport cydriver
 from cuda.core._resource_handles cimport DevicePtrHandle
 from cuda.core._stream cimport Stream
 
@@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle(
     MemoryResource mr,
     object ipc_descriptor = *
 )
+
+# Memory attribute query helpers (used by _managed_memory_ops)
+cdef void _init_mem_attrs(Buffer self)
+cdef int _query_memory_attrs(
+    _MemAttrs& out,
+    cydriver.CUdeviceptr ptr,
+) except -1 nogil
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index ffd82facb5..104252a62b 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -35,7 +35,7 @@ else:
     BufferProtocol = object
 
 from cuda.core._dlpack import DLDeviceType, make_py_capsule
-from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
+from cuda.core._utils.cuda_utils import driver, handle_return
 from cuda.core._device import Device
 
 
@@ -72,449 +72,6 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
 """
 
 
-cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
-    "device",
-    "host",
-    "host_numa",
-    "host_numa_current",
-)
-
-cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
-    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
-    "host": "CU_MEM_LOCATION_TYPE_HOST",
-    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
-    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
-}
-
-cdef dict _MANAGED_ADVICE_ALIASES = {
-    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
-    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
-    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
-    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
-    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
-    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
-}
-
-cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
-    "set_read_mostly",
-    "unset_read_mostly",
-    "unset_preferred_location",
-))
-
-cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
-cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
-cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
-
-cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
-    "set_read_mostly": _DEVICE_HOST_NUMA,
-    "unset_read_mostly": _DEVICE_HOST_NUMA,
-    "set_preferred_location": _ALL_LOCATION_TYPES,
-    "unset_preferred_location": _DEVICE_HOST_NUMA,
-    "set_accessed_by": _DEVICE_HOST_ONLY,
-    "unset_accessed_by": _DEVICE_HOST_ONLY,
-}
-
-cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
-cdef int _HOST_NUMA_CURRENT_ID = 0
-cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
-cdef size_t _SINGLE_RANGE_COUNT = 1
-cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
-cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
-
-# Lazily cached values for immutable runtime properties.
-cdef object _CU_DEVICE_CPU = None
-cdef dict _ADVICE_ENUM_TO_ALIAS = None
-_V2_BINDINGS = -1
-cdef int _DISCARD_PREFETCH_SUPPORTED = -1
-
-
-cdef inline object _managed_location_enum(str location_type):
-    cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
-    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
-    if result is None:
-        raise RuntimeError(
-            f"Managed-memory location type {location_type!r} is not supported by the "
-            f"installed cuda.bindings package."
-        )
-    return result
-
-
-cdef inline object _make_managed_location(str location_type, int location_id):
-    global _CU_DEVICE_CPU
-    cdef object location = driver.CUmemLocation()
-    location.type = _managed_location_enum(location_type)
-    if location_type == "host":
-        if _CU_DEVICE_CPU is None:
-            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
-        location.id = _CU_DEVICE_CPU
-    elif location_type == "host_numa_current":
-        location.id = _HOST_NUMA_CURRENT_ID
-    else:
-        location.id = location_id
-    return location
-
-
-cdef inline tuple _normalize_managed_advice(object advice):
-    cdef str alias
-    cdef str attr_name
-    if isinstance(advice, str):
-        alias = advice.lower()
-        attr_name = _MANAGED_ADVICE_ALIASES.get(alias)
-        if attr_name is None:
-            raise ValueError(
-                "advice must be one of "
-                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
-            )
-        return alias, getattr(driver.CUmem_advise, attr_name)
-
-    if isinstance(advice, driver.CUmem_advise):
-        global _ADVICE_ENUM_TO_ALIAS
-        if _ADVICE_ENUM_TO_ALIAS is None:
-            _ADVICE_ENUM_TO_ALIAS = {}
-            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
-                enum_val = getattr(driver.CUmem_advise, attr_name, None)
-                if enum_val is not None:
-                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
-        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
-        if alias is None:
-            raise ValueError(f"Unsupported advice value: {advice!r}")
-        return alias, advice
-
-    raise TypeError(
-        "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
-    )
-
-
-cdef inline object _normalize_managed_location(
-    object location,
-    object location_type,
-    str what,
-    bint allow_none=False,
-    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
-):
-    cdef object loc_type
-    cdef int loc_id
-
-    if isinstance(location, Device):
-        location = location.device_id
-
-    if location_type is not None and not isinstance(location_type, str):
-        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
-
-    loc_type = None if location_type is None else (<str>location_type).lower()
-    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
-        raise ValueError(
-            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
-            f"or None, got {location_type!r}"
-        )
-
-    if loc_type is not None and loc_type not in allowed_loctypes:
-        raise ValueError(f"{what} does not support location_type='{loc_type}'")
-
-    if loc_type is None:
-        if location is None:
-            if allow_none:
-                return _make_managed_location("host", -1)
-            raise ValueError(f"{what} requires a location")
-        if not isinstance(location, int):
-            raise TypeError(
-                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
-            )
-        loc_id = <int>location
-        if loc_id == -1:
-            if "host" not in allowed_loctypes:
-                raise ValueError(f"{what} does not support host locations")
-            return _make_managed_location("host", -1)
-        elif loc_id >= 0:
-            return _make_managed_location("device", loc_id)
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
-            )
-    elif loc_type == "device":
-        if isinstance(location, int) and <int>location >= 0:
-            loc_id = <int>location
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
-            )
-        return _make_managed_location(loc_type, loc_id)
-    elif loc_type == "host":
-        if location not in (None, -1):
-            raise ValueError(
-                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
-            )
-        return _make_managed_location(loc_type, -1)
-    elif loc_type == "host_numa":
-        if not isinstance(location, int) or <int>location < 0:
-            raise ValueError(
-                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
-            )
-        return _make_managed_location(loc_type, <int>location)
-    else:
-        if location is not None:
-            raise ValueError(
-                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
-            )
-        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
-
-
-cdef inline bint _managed_location_uses_v2_bindings():
-    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
-    global _V2_BINDINGS
-    if _V2_BINDINGS < 0:
-        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
-    return _V2_BINDINGS != 0
-
-
-cdef object _LEGACY_LOC_DEVICE = None
-cdef object _LEGACY_LOC_HOST = None
-
-cdef inline int _managed_location_to_legacy_device(object location, str what):
-    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
-    if _LEGACY_LOC_DEVICE is None:
-        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
-        _LEGACY_LOC_HOST = _managed_location_enum("host")
-    cdef object loc_type = location.type
-    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
-        return <int>location.id
-    raise RuntimeError(
-        f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
-    )
-
-
-cdef inline void _require_managed_buffer(Buffer self, str what):
-    _init_mem_attrs(self)
-    if not self._mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory allocation")
-
-
-cdef inline void _require_managed_discard_prefetch_support(str what):
-    global _DISCARD_PREFETCH_SUPPORTED
-    if _DISCARD_PREFETCH_SUPPORTED < 0:
-        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
-    if not _DISCARD_PREFETCH_SUPPORTED:
-        raise RuntimeError(
-            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
-        )
-
-
-cdef inline tuple _managed_range_from_buffer(
-    Buffer buffer,
-    int size,
-    str what,
-):
-    if size != _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} does not accept size= when target is a Buffer")
-    _require_managed_buffer(buffer, what)
-    return buffer.handle, buffer._size
-
-
-cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0:
-    cdef object ptr_obj
-    try:
-        ptr_obj = int(target)
-    except Exception as exc:
-        raise TypeError(
-            f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}"
-        ) from exc
-    if ptr_obj < 0:
-        raise ValueError(f"{what} target pointer must be >= 0, got {target!r}")
-    return <uintptr_t>ptr_obj
-
-
-cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1:
-    cdef _MemAttrs mem_attrs
-    with nogil:
-        _query_memory_attrs(mem_attrs, <cydriver.CUdeviceptr>ptr)
-    if not mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory allocation")
-    return 0
-
-
-cdef inline tuple _normalize_managed_target_range(
-    object target,
-    int size,
-    str what,
-):
-    cdef uintptr_t ptr
-
-    if isinstance(target, Buffer):
-        return _managed_range_from_buffer(<Buffer>target, size, what)
-
-    if size == _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} requires size= when target is a raw pointer")
-    ptr = _coerce_raw_pointer(target, what)
-    _require_managed_pointer(ptr, what)
-    return ptr, <size_t>size
-
-
-def advise(
-    target,
-    advice: driver.CUmem_advise | str,
-    location: Device | int | None = None,
-    *,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Apply managed-memory advice to an allocation range.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    advice : :obj:`~driver.CUmem_advise` | str
-        Managed-memory advice to apply. String aliases such as
-        ``"set_read_mostly"``, ``"set_preferred_location"``, and
-        ``"set_accessed_by"`` are accepted.
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None`` for
-        advice values that ignore location.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef str advice_name
-    cdef object ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "advise")
-    advice_name, advice = _normalize_managed_advice(advice)
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "advise",
-        allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
-    else:
-        handle_return(
-            driver.cuMemAdvise(
-                ptr,
-                nbytes,
-                advice,
-                _managed_location_to_legacy_device(location, "advise"),
-            )
-        )
-
-
-def prefetch(
-    target,
-    location: Device | int | None = None,
-    *,
-    stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Prefetch a managed-memory allocation range to a target location.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous prefetch.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef Stream s = Stream_accept(stream)
-    cdef object ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch")
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "prefetch",
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                location,
-                _MANAGED_OPERATION_FLAGS,
-                s.handle,
-            )
-        )
-    else:
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                _managed_location_to_legacy_device(location, "prefetch"),
-                s.handle,
-            )
-        )
-
-
-def discard_prefetch(
-    target,
-    location: Device | int | None = None,
-    *,
-    stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Discard a managed-memory allocation range and prefetch it to a target location.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for discard_prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous operation.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef object ptr
-    cdef object batch_ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
-    _require_managed_discard_prefetch_support("discard_prefetch")
-    cdef Stream s = Stream_accept(stream)
-    batch_ptr = driver.CUdeviceptr(int(ptr))
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "discard_prefetch",
-    )
-    handle_return(
-        driver.cuMemDiscardAndPrefetchBatchAsync(
-            [batch_ptr],
-            [nbytes],
-            _SINGLE_RANGE_COUNT,
-            [location],
-            [_FIRST_PREFETCH_LOCATION_INDEX],
-            _SINGLE_PREFETCH_LOCATION_COUNT,
-            _MANAGED_OPERATION_FLAGS,
-            s.handle,
-        )
-    )
-
 cdef class Buffer:
     """Represent a handle to allocated memory.
 
@@ -864,14 +421,14 @@ cdef class Buffer:
 
 # Memory Attribute Query Helpers
 # ------------------------------
-cdef inline void _init_mem_attrs(Buffer self):
+cdef void _init_mem_attrs(Buffer self):
     """Initialize memory attributes by querying the pointer."""
     if not self._mem_attrs_inited:
         _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
         self._mem_attrs_inited = True
 
 
-cdef inline int _query_memory_attrs(
+cdef int _query_memory_attrs(
     _MemAttrs& out,
     cydriver.CUdeviceptr ptr
 ) except -1 nogil:
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
new file mode 100644
index 0000000000..a7019c784d
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Managed-memory operation helpers (advise, prefetch, discard_prefetch).
+# The public API is exposed via def functions; no cdef declarations needed.
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
new file mode 100644
index 0000000000..649c2cbe72
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -0,0 +1,458 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs
+from cuda.core._stream cimport Stream, Stream_accept
+
+from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
+from cuda.core._device import Device
+
+
+cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
+    "device",
+    "host",
+    "host_numa",
+    "host_numa_current",
+)
+
+cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
+    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
+    "host": "CU_MEM_LOCATION_TYPE_HOST",
+    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
+    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
+}
+
+cdef dict _MANAGED_ADVICE_ALIASES = {
+    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
+    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
+    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
+    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
+    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
+    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
+}
+
+cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
+    "set_read_mostly",
+    "unset_read_mostly",
+    "unset_preferred_location",
+))
+
+cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
+cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
+cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
+
+cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
+    "set_read_mostly": _DEVICE_HOST_NUMA,
+    "unset_read_mostly": _DEVICE_HOST_NUMA,
+    "set_preferred_location": _ALL_LOCATION_TYPES,
+    "unset_preferred_location": _DEVICE_HOST_NUMA,
+    "set_accessed_by": _DEVICE_HOST_ONLY,
+    "unset_accessed_by": _DEVICE_HOST_ONLY,
+}
+
+cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
+cdef int _HOST_NUMA_CURRENT_ID = 0
+cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
+cdef size_t _SINGLE_RANGE_COUNT = 1
+cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
+cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
+
+# Lazily cached values for immutable runtime properties.
+cdef object _CU_DEVICE_CPU = None
+cdef dict _ADVICE_ENUM_TO_ALIAS = None
+_V2_BINDINGS = -1
+cdef int _DISCARD_PREFETCH_SUPPORTED = -1
+
+
+cdef object _managed_location_enum(str location_type):
+    cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
+    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
+    if result is None:
+        raise RuntimeError(
+            f"Managed-memory location type {location_type!r} is not supported by the "
+            f"installed cuda.bindings package."
+        )
+    return result
+
+
+cdef object _make_managed_location(str location_type, int location_id):
+    global _CU_DEVICE_CPU
+    cdef object location = driver.CUmemLocation()
+    location.type = _managed_location_enum(location_type)
+    if location_type == "host":
+        if _CU_DEVICE_CPU is None:
+            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        location.id = _CU_DEVICE_CPU
+    elif location_type == "host_numa_current":
+        location.id = _HOST_NUMA_CURRENT_ID
+    else:
+        location.id = location_id
+    return location
+
+
+cdef tuple _normalize_managed_advice(object advice):
+    cdef str alias
+    cdef str attr_name
+    if isinstance(advice, str):
+        alias = advice.lower()
+        attr_name = _MANAGED_ADVICE_ALIASES.get(alias)
+        if attr_name is None:
+            raise ValueError(
+                "advice must be one of "
+                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
+            )
+        return alias, getattr(driver.CUmem_advise, attr_name)
+
+    if isinstance(advice, driver.CUmem_advise):
+        global _ADVICE_ENUM_TO_ALIAS
+        if _ADVICE_ENUM_TO_ALIAS is None:
+            _ADVICE_ENUM_TO_ALIAS = {}
+            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+                enum_val = getattr(driver.CUmem_advise, attr_name, None)
+                if enum_val is not None:
+                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
+        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
+        if alias is None:
+            raise ValueError(f"Unsupported advice value: {advice!r}")
+        return alias, advice
+
+    raise TypeError(
+        "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
+    )
+
+
+cdef object _normalize_managed_location(
+    object location,
+    object location_type,
+    str what,
+    bint allow_none=False,
+    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
+):
+    cdef object loc_type
+    cdef int loc_id
+
+    if isinstance(location, Device):
+        location = location.device_id
+
+    if location_type is not None and not isinstance(location_type, str):
+        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
+
+    loc_type = None if location_type is None else (<str>location_type).lower()
+    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
+        raise ValueError(
+            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
+            f"or None, got {location_type!r}"
+        )
+
+    if loc_type is not None and loc_type not in allowed_loctypes:
+        raise ValueError(f"{what} does not support location_type='{loc_type}'")
+
+    if loc_type is None:
+        if location is None:
+            if allow_none:
+                return _make_managed_location("host", -1)
+            raise ValueError(f"{what} requires a location")
+        if not isinstance(location, int):
+            raise TypeError(
+                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
+            )
+        loc_id = <int>location
+        if loc_id == -1:
+            if "host" not in allowed_loctypes:
+                raise ValueError(f"{what} does not support host locations")
+            return _make_managed_location("host", -1)
+        elif loc_id >= 0:
+            return _make_managed_location("device", loc_id)
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
+            )
+    elif loc_type == "device":
+        if isinstance(location, int) and <int>location >= 0:
+            loc_id = <int>location
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
+            )
+        return _make_managed_location(loc_type, loc_id)
+    elif loc_type == "host":
+        if location not in (None, -1):
+            raise ValueError(
+                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
+            )
+        return _make_managed_location(loc_type, -1)
+    elif loc_type == "host_numa":
+        if not isinstance(location, int) or <int>location < 0:
+            raise ValueError(
+                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
+            )
+        return _make_managed_location(loc_type, <int>location)
+    else:
+        if location is not None:
+            raise ValueError(
+                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
+            )
+        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
+
+
+cdef bint _managed_location_uses_v2_bindings():
+    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
+    global _V2_BINDINGS
+    if _V2_BINDINGS < 0:
+        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
+    return _V2_BINDINGS != 0
+
+
+cdef object _LEGACY_LOC_DEVICE = None
+cdef object _LEGACY_LOC_HOST = None
+
+cdef int _managed_location_to_legacy_device(object location, str what):
+    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
+    if _LEGACY_LOC_DEVICE is None:
+        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
+        _LEGACY_LOC_HOST = _managed_location_enum("host")
+    cdef object loc_type = location.type
+    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
+        return <int>location.id
+    raise RuntimeError(
+        f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
+    )
+
+
+cdef void _require_managed_buffer(Buffer self, str what):
+    _init_mem_attrs(self)
+    if not self._mem_attrs.is_managed:
+        raise ValueError(f"{what} requires a managed-memory allocation")
+
+
+cdef void _require_managed_discard_prefetch_support(str what):
+    global _DISCARD_PREFETCH_SUPPORTED
+    if _DISCARD_PREFETCH_SUPPORTED < 0:
+        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
+    if not _DISCARD_PREFETCH_SUPPORTED:
+        raise RuntimeError(
+            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
+        )
+
+
+cdef tuple _managed_range_from_buffer(
+    Buffer buffer,
+    int size,
+    str what,
+):
+    if size != _MANAGED_SIZE_NOT_PROVIDED:
+        raise TypeError(f"{what} does not accept size= when target is a Buffer")
+    _require_managed_buffer(buffer, what)
+    return buffer.handle, buffer._size
+
+
+cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0:
+    cdef object ptr_obj
+    try:
+        ptr_obj = int(target)
+    except Exception as exc:
+        raise TypeError(
+            f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}"
+        ) from exc
+    if ptr_obj < 0:
+        raise ValueError(f"{what} target pointer must be >= 0, got {target!r}")
+    return <uintptr_t>ptr_obj
+
+
+cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1:
+    cdef _MemAttrs mem_attrs
+    with nogil:
+        _query_memory_attrs(mem_attrs, <cydriver.CUdeviceptr>ptr)
+    if not mem_attrs.is_managed:
+        raise ValueError(f"{what} requires a managed-memory allocation")
+    return 0
+
+
+cdef tuple _normalize_managed_target_range(
+    object target,
+    int size,
+    str what,
+):
+    cdef uintptr_t ptr
+
+    if isinstance(target, Buffer):
+        return _managed_range_from_buffer(<Buffer>target, size, what)
+
+    if size == _MANAGED_SIZE_NOT_PROVIDED:
+        raise TypeError(f"{what} requires size= when target is a raw pointer")
+    ptr = _coerce_raw_pointer(target, what)
+    _require_managed_pointer(ptr, what)
+    return ptr, <size_t>size
+
+
+def advise(
+    target,
+    advice: driver.CUmem_advise | str,
+    location: Device | int | None = None,
+    *,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Apply managed-memory advice to an allocation range.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    advice : :obj:`~driver.CUmem_advise` | str
+        Managed-memory advice to apply. String aliases such as
+        ``"set_read_mostly"``, ``"set_preferred_location"``, and
+        ``"set_accessed_by"`` are accepted.
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None`` for
+        advice values that ignore location.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
+    cdef str advice_name
+    cdef object ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "advise")
+    advice_name, advice = _normalize_managed_advice(advice)
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "advise",
+        allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
+        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
+    )
+    if _managed_location_uses_v2_bindings():
+        handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
+    else:
+        handle_return(
+            driver.cuMemAdvise(
+                ptr,
+                nbytes,
+                advice,
+                _managed_location_to_legacy_device(location, "advise"),
+            )
+        )
+
+
+def prefetch(
+    target,
+    location: Device | int | None = None,
+    *,
+    stream: Stream | GraphBuilder,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Prefetch a managed-memory allocation range to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None``.
+        A location is required for prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous prefetch.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
+    cdef Stream s = Stream_accept(stream)
+    cdef object ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch")
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "prefetch",
+    )
+    if _managed_location_uses_v2_bindings():
+        handle_return(
+            driver.cuMemPrefetchAsync(
+                ptr,
+                nbytes,
+                location,
+                _MANAGED_OPERATION_FLAGS,
+                s.handle,
+            )
+        )
+    else:
+        handle_return(
+            driver.cuMemPrefetchAsync(
+                ptr,
+                nbytes,
+                _managed_location_to_legacy_device(location, "prefetch"),
+                s.handle,
+            )
+        )
+
+
+def discard_prefetch(
+    target,
+    location: Device | int | None = None,
+    *,
+    stream: Stream | GraphBuilder,
+    int size=_MANAGED_SIZE_NOT_PROVIDED,
+    location_type: str | None = None,
+):
+    """Discard a managed-memory allocation range and prefetch it to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None``.
+        A location is required for discard_prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous operation.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
+    cdef object ptr
+    cdef object batch_ptr
+    cdef size_t nbytes
+
+    ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
+    _require_managed_discard_prefetch_support("discard_prefetch")
+    cdef Stream s = Stream_accept(stream)
+    batch_ptr = driver.CUdeviceptr(int(ptr))
+    location = _normalize_managed_location(
+        location,
+        location_type,
+        "discard_prefetch",
+    )
+    handle_return(
+        driver.cuMemDiscardAndPrefetchBatchAsync(
+            [batch_ptr],
+            [nbytes],
+            _SINGLE_RANGE_COUNT,
+            [location],
+            [_FIRST_PREFETCH_LOCATION_INDEX],
+            _SINGLE_PREFETCH_LOCATION_COUNT,
+            _MANAGED_OPERATION_FLAGS,
+            s.handle,
+        )
+    )
diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
index f5bb09c13d..005c9ec3cf 100644
--- a/cuda_core/cuda/core/managed_memory.py
+++ b/cuda_core/cuda/core/managed_memory.py
@@ -4,6 +4,6 @@
 
 """Managed-memory range operations."""
 
-from cuda.core._memory._buffer import advise, discard_prefetch, prefetch
+from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch
 
 __all__ = ["advise", "discard_prefetch", "prefetch"]
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 56c505fbe6..544b7afc03 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -44,7 +44,7 @@
     system as ccx_system,
 )
 from cuda.core._dlpack import DLDeviceType
-from cuda.core._memory import IPCBufferDescriptor, _buffer
+from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops
 from cuda.core._utils.cuda_utils import CUDAError, handle_return
 from cuda.core.utils import StridedMemoryView
 
@@ -1313,9 +1313,9 @@ def fake_cuMemAdvise(ptr, size, advice, location):
         calls.append((ptr, size, advice, location))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
-    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
-    monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise)
+    monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
+    monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise)
 
     managed_memory.advise(buffer, "set_read_mostly")
 
@@ -1338,9 +1338,9 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
         calls.append((ptr, size, location, hstream))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
-    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
-    monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
+    monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
+    monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
 
     managed_memory.prefetch(buffer, device, stream=stream)
 

From 90f07117615a25b45baf9722c3c1f0835c85d1c5 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Wed, 18 Mar 2026 14:16:38 -0700
Subject: [PATCH 14/31] pre-commit fix: drop handle_return import from _buffer.pyx

---
 cuda_core/cuda/core/_memory/_buffer.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 104252a62b..e47f3f4926 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -35,7 +35,7 @@ else:
     BufferProtocol = object
 
 from cuda.core._dlpack import DLDeviceType, make_py_capsule
-from cuda.core._utils.cuda_utils import driver, handle_return
+from cuda.core._utils.cuda_utils import driver
 from cuda.core._device import Device
 
 

From b4d252cdb5a8899d775db185d0cc9ec92c9cd474 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 19 Mar 2026 11:07:46 -0700
Subject: [PATCH 15/31] Remove comment-only _managed_memory_ops.pxd

---
 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
deleted file mode 100644
index a7019c784d..0000000000
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-# Managed-memory operation helpers (advise, prefetch, discard_prefetch).
-# The public API is exposed via def functions; no cdef declarations needed.

From faaa1d881363eb4ea5d3d13cf0a21b433cdcd61f Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Thu, 19 Mar 2026 13:15:08 -0700
Subject: [PATCH 16/31] Require Buffer targets in managed-memory ops; drop raw-pointer/size= path

---
 .../cuda/core/_memory/_managed_memory_ops.pyx | 117 +++++-------------
 cuda_core/tests/test_memory.py                |  42 -------
 2 files changed, 29 insertions(+), 130 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 649c2cbe72..04dc33ed75 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -4,10 +4,7 @@
 
 from __future__ import annotations
 
-from libc.stdint cimport uintptr_t
-
-from cuda.bindings cimport cydriver
-from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs
+from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs
 from cuda.core._stream cimport Stream, Stream_accept
 
 from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
@@ -56,7 +53,6 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
     "unset_accessed_by": _DEVICE_HOST_ONLY,
 }
 
-cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
 cdef int _HOST_NUMA_CURRENT_ID = 0
 cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
 cdef size_t _SINGLE_RANGE_COUNT = 1
@@ -241,71 +237,19 @@ cdef void _require_managed_discard_prefetch_support(str what):
         )
 
 
-cdef tuple _managed_range_from_buffer(
-    Buffer buffer,
-    int size,
-    str what,
-):
-    if size != _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} does not accept size= when target is a Buffer")
-    _require_managed_buffer(buffer, what)
-    return buffer.handle, buffer._size
-
-
-cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0:
-    cdef object ptr_obj
-    try:
-        ptr_obj = int(target)
-    except Exception as exc:
-        raise TypeError(
-            f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}"
-        ) from exc
-    if ptr_obj < 0:
-        raise ValueError(f"{what} target pointer must be >= 0, got {target!r}")
-    return <uintptr_t>ptr_obj
-
-
-cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1:
-    cdef _MemAttrs mem_attrs
-    with nogil:
-        _query_memory_attrs(mem_attrs, <cydriver.CUdeviceptr>ptr)
-    if not mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory allocation")
-    return 0
-
-
-cdef tuple _normalize_managed_target_range(
-    object target,
-    int size,
-    str what,
-):
-    cdef uintptr_t ptr
-
-    if isinstance(target, Buffer):
-        return _managed_range_from_buffer(<Buffer>target, size, what)
-
-    if size == _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} requires size= when target is a raw pointer")
-    ptr = _coerce_raw_pointer(target, what)
-    _require_managed_pointer(ptr, what)
-    return ptr, <size_t>size
-
-
 def advise(
-    target,
+    target: Buffer,
     advice: driver.CUmem_advise | str,
     location: Device | int | None = None,
     *,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
     """Apply managed-memory advice to an allocation range.
 
     Parameters
     ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
+    target : :class:`Buffer`
+        Managed allocation to operate on.
     advice : :obj:`~driver.CUmem_advise` | str
         Managed-memory advice to apply. String aliases such as
         ``"set_read_mostly"``, ``"set_preferred_location"``, and
@@ -314,17 +258,18 @@ def advise(
         Target location. When ``location_type`` is ``None``, values are
         interpreted as a device ordinal, ``-1`` for host, or ``None`` for
         advice values that ignore location.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
     location_type : str | None, optional
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
+    if not isinstance(target, Buffer):
+        raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}")
+    cdef Buffer buf = <Buffer>target
+    _require_managed_buffer(buf, "advise")
     cdef str advice_name
-    cdef object ptr
-    cdef size_t nbytes
+    cdef object ptr = buf.handle
+    cdef size_t nbytes = buf._size
 
-    ptr, nbytes = _normalize_managed_target_range(target, size, "advise")
     advice_name, advice = _normalize_managed_advice(advice)
     location = _normalize_managed_location(
         location,
@@ -347,37 +292,36 @@ def advise(
 
 
 def prefetch(
-    target,
+    target: Buffer,
     location: Device | int | None = None,
     *,
     stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
     """Prefetch a managed-memory allocation range to a target location.
 
     Parameters
     ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
+    target : :class:`Buffer`
+        Managed allocation to operate on.
     location : :obj:`~_device.Device` | int | None, optional
         Target location. When ``location_type`` is ``None``, values are
         interpreted as a device ordinal, ``-1`` for host, or ``None``.
         A location is required for prefetch.
     stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
         Keyword argument specifying the stream for the asynchronous prefetch.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
     location_type : str | None, optional
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
+    if not isinstance(target, Buffer):
+        raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}")
+    cdef Buffer buf = <Buffer>target
+    _require_managed_buffer(buf, "prefetch")
     cdef Stream s = Stream_accept(stream)
-    cdef object ptr
-    cdef size_t nbytes
+    cdef object ptr = buf.handle
+    cdef size_t nbytes = buf._size
 
-    ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch")
     location = _normalize_managed_location(
         location,
         location_type,
@@ -405,40 +349,37 @@ def prefetch(
 
 
 def discard_prefetch(
-    target,
+    target: Buffer,
     location: Device | int | None = None,
     *,
     stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
     """Discard a managed-memory allocation range and prefetch it to a target location.
 
     Parameters
     ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
+    target : :class:`Buffer`
+        Managed allocation to operate on.
     location : :obj:`~_device.Device` | int | None, optional
         Target location. When ``location_type`` is ``None``, values are
         interpreted as a device ordinal, ``-1`` for host, or ``None``.
         A location is required for discard_prefetch.
     stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
         Keyword argument specifying the stream for the asynchronous operation.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
     location_type : str | None, optional
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
-    cdef object ptr
-    cdef object batch_ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
+    if not isinstance(target, Buffer):
+        raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}")
+    cdef Buffer buf = <Buffer>target
+    _require_managed_buffer(buf, "discard_prefetch")
     _require_managed_discard_prefetch_support("discard_prefetch")
     cdef Stream s = Stream_accept(stream)
-    batch_ptr = driver.CUdeviceptr(int(ptr))
+    cdef object ptr = buf.handle
+    cdef size_t nbytes = buf._size
+    cdef object batch_ptr = driver.CUdeviceptr(int(ptr))
     location = _normalize_managed_location(
         location,
         location_type,
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 544b7afc03..dbb5ac6d8c 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1441,20 +1441,6 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda):
     buffer.close()
 
 
-def test_managed_memory_advise_size_rejected_for_buffer(init_cuda):
-    """advise() raises TypeError when size= is given with a Buffer target."""
-    device = Device()
-    _skip_if_managed_allocation_unsupported(device)
-    device.set_current()
-
-    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-
-    with pytest.raises(TypeError, match="does not accept size="):
-        managed_memory.advise(buffer, "set_read_mostly", size=1024)
-
-    buffer.close()
-
-
 def test_managed_memory_advise_invalid_advice_values(init_cuda):
     """advise() rejects invalid advice strings and wrong types."""
     device = Device()
@@ -1472,34 +1458,6 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda):
     buffer.close()
 
 
-def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda):
-    device = Device()
-    _skip_if_managed_location_ops_unsupported(device)
-    device.set_current()
-
-    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-    stream = device.create_stream()
-
-    managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size)
-    assert (
-        _get_int_mem_range_attr(
-            buffer,
-            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
-        )
-        == _READ_MOSTLY_ENABLED
-    )
-
-    managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream)
-    stream.sync()
-    last_location = _get_int_mem_range_attr(
-        buffer,
-        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
-    )
-    assert last_location == device.device_id
-
-    buffer.close()
-
-
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch

From cf2f20d1be323b8cd31f76125dffad959cf0b947 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 16:46:30 -0700
Subject: [PATCH 17/31] fix(cuda.core): update binding_version import after
 upstream merge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream renamed get_binding_version → binding_version and moved it from
cuda.core._utils.cuda_utils to cuda.core._utils.version. Update the
managed-memory ops module to match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 04dc33ed75..81ff5582a6 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -7,7 +7,8 @@ from __future__ import annotations
 from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs
 from cuda.core._stream cimport Stream, Stream_accept
 
-from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
+from cuda.core._utils.cuda_utils import driver, handle_return
+from cuda.core._utils.version import binding_version
 from cuda.core._device import Device
 
 
@@ -201,7 +202,7 @@ cdef bint _managed_location_uses_v2_bindings():
     # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
     global _V2_BINDINGS
     if _V2_BINDINGS < 0:
-        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
+        _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0
     return _V2_BINDINGS != 0
 
 

From db3bac2e042ff07b6ab37f510f2fe06bc1cbc598 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 16:46:36 -0700
Subject: [PATCH 18/31] revert: drop managed_memory shim in
 cuda.core.experimental

The cuda.core.experimental namespace is being deprecated and should not
gain new submodules. Per review feedback, the managed_memory module
should only be reachable via cuda.core.managed_memory, not via the
experimental compatibility shim.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/experimental/__init__.py         | 3 +--
 cuda_core/tests/test_experimental_backward_compat.py | 7 -------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 34b442173b..f65e7852a9 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -38,10 +38,9 @@ def _warn_deprecated():
 _warn_deprecated()
 
 
-from cuda.core import managed_memory, system, utils
+from cuda.core import system, utils
 
 # Make utils accessible as a submodule for backward compatibility
-__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory
 __import__("sys").modules[__spec__.name + ".utils"] = utils
 
 
diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py
index 82e2cdd5be..c3215b056a 100644
--- a/cuda_core/tests/test_experimental_backward_compat.py
+++ b/cuda_core/tests/test_experimental_backward_compat.py
@@ -38,7 +38,6 @@ def test_experimental_backward_compatibility():
     assert hasattr(cuda.core.experimental, "Device")
     assert hasattr(cuda.core.experimental, "Stream")
     assert hasattr(cuda.core.experimental, "Buffer")
-    assert hasattr(cuda.core.experimental, "managed_memory")
     assert hasattr(cuda.core.experimental, "system")
 
     # Test 2: Direct imports - should emit deprecation warning
@@ -74,7 +73,6 @@ def test_experimental_backward_compatibility():
     assert cuda.core.experimental.Linker is cuda.core.Linker
 
     # Compare singletons
-    assert cuda.core.experimental.managed_memory is cuda.core.managed_memory
     assert cuda.core.experimental.system is cuda.core.system
 
     # Test 4: Utils module works
@@ -90,11 +88,6 @@ def test_experimental_backward_compatibility():
 
     assert StridedMemoryView is not None
     assert args_viewable_as_strided_memory is not None
-    from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch
-
-    assert advise is not None
-    assert prefetch is not None
-    assert discard_prefetch is not None
 
     # Test 5: Options classes are accessible
     assert hasattr(cuda.core.experimental, "EventOptions")

From 20d036ebe1ae148222b4ad9e0fdca20502ed24de Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 16:59:58 -0700
Subject: [PATCH 19/31] feat(cuda.core): add Location dataclass for managed
 memory

Frozen dataclass with classmethod constructors for the four CUmemLocationType
kinds (device, host, host_numa, host_numa_current). Validates id constraints
in __post_init__. Re-exported from cuda.core.managed_memory.

This will replace the location=/location_type= kwargs in the upcoming
unified 1..N managed-memory ops API.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_location.py    | 51 +++++++++++++++++++
 cuda_core/cuda/core/managed_memory.py         |  3 +-
 cuda_core/tests/test_memory.py                | 43 ++++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 cuda_core/cuda/core/_memory/_managed_location.py

diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
new file mode 100644
index 0000000000..7e2515f573
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current")
+LocationKind = Literal["device", "host", "host_numa", "host_numa_current"]
+
+
+@dataclass(frozen=True)
+class Location:
+    """Typed managed-memory location.
+
+    Use the classmethod constructors (``device``, ``host``, ``host_numa``,
+    ``host_numa_current``) rather than constructing directly.
+    """
+
+    kind: LocationKind
+    id: int | None = None
+
+    def __post_init__(self) -> None:
+        if self.kind not in _VALID_KINDS:
+            raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}")
+        if self.kind == "device":
+            if not isinstance(self.id, int) or self.id < 0:
+                raise ValueError("device id must be >= 0")
+        elif self.kind == "host_numa":
+            if not isinstance(self.id, int) or self.id < 0:
+                raise ValueError("host_numa id must be >= 0")
+        elif self.kind in ("host", "host_numa_current"):
+            if self.id is not None:
+                raise ValueError(f"{self.kind} location must have id=None")
+
+    @classmethod
+    def device(cls, device_id: int) -> "Location":
+        return cls(kind="device", id=device_id)
+
+    @classmethod
+    def host(cls) -> "Location":
+        return cls(kind="host", id=None)
+
+    @classmethod
+    def host_numa(cls, numa_id: int) -> "Location":
+        return cls(kind="host_numa", id=numa_id)
+
+    @classmethod
+    def host_numa_current(cls) -> "Location":
+        return cls(kind="host_numa_current", id=None)
diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
index 005c9ec3cf..25191fe038 100644
--- a/cuda_core/cuda/core/managed_memory.py
+++ b/cuda_core/cuda/core/managed_memory.py
@@ -4,6 +4,7 @@
 
 """Managed-memory range operations."""
 
+from cuda.core._memory._managed_location import Location
 from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch
 
-__all__ = ["advise", "discard_prefetch", "prefetch"]
+__all__ = ["Location", "advise", "discard_prefetch", "prefetch"]
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 7ff15047e8..8b3db88b8d 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1918,3 +1918,46 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
     assert buffer.handle >= 0
     assert buffer.size == 0
     assert buffer.device_id == mr.device_id
+
+
+class TestLocation:
+    def test_device_constructor(self):
+        from cuda.core.managed_memory import Location
+        loc = Location.device(0)
+        assert loc.kind == "device"
+        assert loc.id == 0
+
+    def test_host_constructor(self):
+        from cuda.core.managed_memory import Location
+        loc = Location.host()
+        assert loc.kind == "host"
+        assert loc.id is None
+
+    def test_host_numa_constructor(self):
+        from cuda.core.managed_memory import Location
+        loc = Location.host_numa(3)
+        assert loc.kind == "host_numa"
+        assert loc.id == 3
+
+    def test_host_numa_current_constructor(self):
+        from cuda.core.managed_memory import Location
+        loc = Location.host_numa_current()
+        assert loc.kind == "host_numa_current"
+        assert loc.id is None
+
+    def test_frozen(self):
+        import dataclasses
+        from cuda.core.managed_memory import Location
+        loc = Location.device(0)
+        with pytest.raises(dataclasses.FrozenInstanceError):
+            loc.id = 1
+
+    def test_invalid_device_id(self):
+        from cuda.core.managed_memory import Location
+        with pytest.raises(ValueError, match="device id must be >= 0"):
+            Location.device(-1)
+
+    def test_invalid_kind(self):
+        from cuda.core.managed_memory import Location
+        with pytest.raises(ValueError, match="kind must be one of"):
+            Location(kind="not_a_kind", id=None)

From c2dae533f073fab65d81f6524be78d9c2e129d1e Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 17:02:51 -0700
Subject: [PATCH 20/31] feat(cuda.core): add _coerce_location helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Centralizes back-compat coercion for managed-memory Location inputs:
- Location → passthrough
- Device → Location.device(device_id)
- int >= 0 → Location.device(int)
- int == -1 → Location.host()
- None → None when allow_none=True, else ValueError

Will be used by the unified 1..N managed-memory ops API.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_location.py    | 29 ++++++++++++
 cuda_core/tests/test_memory.py                | 44 +++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
index 7e2515f573..e081a8da32 100644
--- a/cuda_core/cuda/core/_memory/_managed_location.py
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -49,3 +49,32 @@ def host_numa(cls, numa_id: int) -> "Location":
     @classmethod
     def host_numa_current(cls) -> "Location":
         return cls(kind="host_numa_current", id=None)
+
+
+def _coerce_location(value, *, allow_none: bool = False) -> Location | None:
+    """Coerce user input to a Location instance.
+
+    Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device,
+    -1 → host), None (only if allow_none=True).
+    """
+    from cuda.core._device import Device  # avoid import cycle at module load
+
+    if isinstance(value, Location):
+        return value
+    if isinstance(value, Device):
+        return Location.device(value.device_id)
+    if value is None:
+        if allow_none:
+            return None
+        raise ValueError("location is required")
+    if isinstance(value, int):
+        if value == -1:
+            return Location.host()
+        if value >= 0:
+            return Location.device(value)
+        raise ValueError(
+            f"device ordinal must be >= 0 (or -1 for host), got {value}"
+        )
+    raise TypeError(
+        f"location must be a Location, Device, int, or None; got {type(value).__name__}"
+    )
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 8b3db88b8d..bccc0fa67b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1961,3 +1961,47 @@ def test_invalid_kind(self):
         from cuda.core.managed_memory import Location
         with pytest.raises(ValueError, match="kind must be one of"):
             Location(kind="not_a_kind", id=None)
+
+
+class TestLocationCoerce:
+    def test_passthrough(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        from cuda.core.managed_memory import Location
+        loc = Location.device(0)
+        assert _coerce_location(loc) is loc
+
+    def test_int_device(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        assert _coerce_location(0).kind == "device"
+        assert _coerce_location(0).id == 0
+
+    def test_int_minus_one_is_host(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        assert _coerce_location(-1).kind == "host"
+
+    def test_device_object(self, init_cuda):
+        from cuda.core import Device
+        from cuda.core._memory._managed_location import _coerce_location
+        dev = Device()
+        loc = _coerce_location(dev)
+        assert loc.kind == "device"
+        assert loc.id == dev.device_id
+
+    def test_none_when_disallowed(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        with pytest.raises(ValueError, match="location is required"):
+            _coerce_location(None, allow_none=False)
+
+    def test_none_when_allowed(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        assert _coerce_location(None, allow_none=True) is None
+
+    def test_bad_int(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        with pytest.raises(ValueError, match="device ordinal"):
+            _coerce_location(-2)
+
+    def test_bad_type(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        with pytest.raises(TypeError, match="Location, Device, int, or None"):
+            _coerce_location("device")

From 935c8ba7b34a8c7e3afc391318d480baee23a551 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 17:18:37 -0700
Subject: [PATCH 21/31] test(cuda.core): update monkeypatch target after
 binding_version rename

The legacy-bindings monkeypatch tests still referenced get_binding_version,
which was renamed to binding_version in cf2f20d1be. Update both occurrences.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/tests/test_memory.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index bccc0fa67b..2304c370fd 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1371,7 +1371,7 @@ def fake_cuMemAdvise(ptr, size, advice, location):
         calls.append((ptr, size, advice, location))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION)
     monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise)
 
@@ -1396,7 +1396,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
         calls.append((ptr, size, location, hstream))
         return (driver.CUresult.CUDA_SUCCESS,)
 
-    monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION)
     monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
 

From dc4653513bc04d1ce1fe1214630fdf628f13ef8a Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 17:19:59 -0700
Subject: [PATCH 22/31] refactor(cuda.core): tighten memory-attr query

Address review feedback on _buffer.pyx:

- Restore `inline` on `_init_mem_attrs` and `_query_memory_attrs`.
- Set `out.is_managed = (is_managed != 0)` once outside the if/elif,
  rather than per-branch (driver leaves the attribute zero for
  non-managed pointers, so all three branches converged on the same
  value anyway).
- Add a TODO noting that HMM/ATS-enabled sysmem should also report
  `is_managed=True`; the CU_POINTER_ATTRIBUTE_IS_MANAGED query does
  not capture that yet.

The Cython modernization of _managed_memory_ops.pyx (cimport cydriver,
IF/ELSE for the 12/13 ABI split) is folded into Tasks 5-8 where the
public API is being rewritten anyway; doing it here would mean
rewriting the same call sites twice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 6c7f8ffd14..4ca8650e8d 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -422,14 +422,14 @@ cdef class Buffer:
 
 # Memory Attribute Query Helpers
 # ------------------------------
-cdef void _init_mem_attrs(Buffer self):
+cdef inline void _init_mem_attrs(Buffer self):
     """Initialize memory attributes by querying the pointer."""
     if not self._mem_attrs_inited:
         _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
         self._mem_attrs_inited = True
 
 
-cdef int _query_memory_attrs(
+cdef inline int _query_memory_attrs(
     _MemAttrs& out,
     cydriver.CUdeviceptr ptr
 ) except -1 nogil:
@@ -456,12 +456,15 @@ cdef int _query_memory_attrs(
         ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
     HANDLE_RETURN(ret)
 
+    # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the
+    # CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet.
+    out.is_managed = is_managed != 0
+
     if memory_type == 0:
         # unregistered host pointer
         out.is_host_accessible = True
         out.is_device_accessible = False
         out.device_id = -1
-        out.is_managed = False
     elif (
         is_managed
         or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
@@ -470,12 +473,10 @@ cdef int _query_memory_attrs(
         out.is_host_accessible = True
         out.is_device_accessible = True
         out.device_id = device_id
-        out.is_managed = is_managed != 0
     elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
         out.is_host_accessible = False
         out.is_device_accessible = True
         out.device_id = device_id
-        out.is_managed = False
     else:
         with cython.gil:
             raise ValueError(f"Unsupported memory type: {memory_type}")

From 818f5d25d8416245b5f781d3d06b5c751337eaa6 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 17:53:12 -0700
Subject: [PATCH 23/31] feat(cuda.core): unified 1..N managed_memory.prefetch
 with cydriver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewrite prefetch() with the unified single-or-batched signature targeted by
issue #1333:

- prefetch(targets, location, *, options=None, stream)
- targets accepts a single Buffer or a sequence of Buffers
- location accepts a Location dataclass, Device, or int (-1 = host); a single
  value is broadcast to every target, while a sequence supplies one location
  per buffer
- length mismatch raises ValueError; empty targets raises ValueError
- options is reserved for future per-call flags and must be None
- stream moved to the end, kept keyword-only

Internals: switch from Python-level driver.cuMemPrefetchAsync to
Cython-level cydriver.cuMemPrefetchAsync via cimport cydriver, with
HANDLE_RETURN. Replace the runtime _V2_BINDINGS check with compile-time
IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE per the codebase precedent in
_managed_memory_resource.pyx, _memory_pool.pyx, _tensor_map.pyx.

N>1 dispatches to cydriver.cuMemPrefetchBatchAsync (CUDA 13 only); on
CUDA 12 builds, batched prefetch raises NotImplementedError. Single-range
prefetch continues to work on both CUDA 12 and 13 builds.

The location_type= keyword is removed; callers express location kind via
the Location dataclass added in 20d036ebe1.

The advise() and discard_prefetch() functions still use the legacy
_normalize_managed_location helper and Python-level driver calls; they
will be migrated in their own tasks.

Also drops test_managed_memory_prefetch_uses_legacy_bindings_signature,
which monkeypatched the Python-level driver.cuMemPrefetchAsync — no
longer applicable since the prefetch path uses cydriver. The corresponding
advise legacy-bindings test stays for now (advise still uses Python driver).

Addresses Andy-Jost's review comment that the existing API is "non-Pythonic"
by making location handling Pythonic in a different way (a typed Location
dataclass) while preserving the free-function shape pending Leo's tie-break
on a ManagedBuffer subclass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_memory_ops.pyx | 210 ++++++++++++++----
 cuda_core/tests/test_memory.py                | 147 +++++++++---
 2 files changed, 284 insertions(+), 73 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 81ff5582a6..b608b532ab 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -4,12 +4,19 @@
 
 from __future__ import annotations
 
+from cpython.mem cimport PyMem_Free, PyMem_Malloc
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
 from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs
+from cuda.core._resource_handles cimport as_cu
 from cuda.core._stream cimport Stream, Stream_accept
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 from cuda.core._utils.cuda_utils import driver, handle_return
 from cuda.core._utils.version import binding_version
 from cuda.core._device import Device
+from cuda.core._memory._managed_location import Location, _coerce_location
 
 
 cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
@@ -228,6 +235,74 @@ cdef void _require_managed_buffer(Buffer self, str what):
         raise ValueError(f"{what} requires a managed-memory allocation")
 
 
+# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...].
+cdef tuple _coerce_buffer_targets(object targets, str what):
+    cdef list out
+    if isinstance(targets, Buffer):
+        return (<Buffer>targets,)
+    if isinstance(targets, (list, tuple)):
+        if not targets:
+            raise ValueError(f"{what}: empty targets sequence")
+        out = []
+        for t in targets:
+            if not isinstance(t, Buffer):
+                raise TypeError(
+                    f"{what}: each target must be a Buffer, got {type(t).__name__}"
+                )
+            out.append(t)
+        return tuple(out)
+    raise TypeError(
+        f"{what}: targets must be a Buffer or sequence of Buffer, "
+        f"got {type(targets).__name__}"
+    )
+
+
+# Broadcast a single location across ``n`` targets, or coerce a length-N
+# sequence elementwise.
+cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
+    cdef object coerced
+    if isinstance(location, (list, tuple)):
+        if len(location) != n:
+            raise ValueError(
+                f"{what}: location length {len(location)} does not match "
+                f"targets length {n}"
+            )
+        return tuple(_coerce_location(loc, allow_none=allow_none) for loc in location)
+    coerced = _coerce_location(location, allow_none=allow_none)
+    return tuple([coerced] * n)
+
+
+IF CUDA_CORE_BUILD_MAJOR >= 13:
+    # Convert a Location dataclass to a cydriver.CUmemLocation struct.
+    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
+        cdef cydriver.CUmemLocation out
+        cdef str kind = loc.kind
+        if kind == "device":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            out.id = <int>loc.id
+        elif kind == "host":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+            out.id = 0
+        elif kind == "host_numa":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
+            out.id = <int>loc.id
+        else:  # host_numa_current
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
+            out.id = 0
+        return out
+ELSE:
+    # CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host).
+    cdef inline int _to_legacy_device(object loc) except? -2:
+        cdef str kind = loc.kind
+        if kind == "device":
+            return <int>loc.id
+        if kind == "host":
+            return -1
+        raise RuntimeError(
+            f"location_type={kind!r} requires a CUDA 13 build of cuda.core"
+        )
+
+
 cdef void _require_managed_discard_prefetch_support(str what):
     global _DISCARD_PREFETCH_SUPPORTED
     if _DISCARD_PREFETCH_SUPPORTED < 0:
@@ -293,59 +368,106 @@ def advise(
 
 
 def prefetch(
-    target: Buffer,
-    location: Device | int | None = None,
+    targets,
+    location=None,
     *,
-    stream: Stream | GraphBuilder,
-    location_type: str | None = None,
+    options=None,
+    stream,
 ):
-    """Prefetch a managed-memory allocation range to a target location.
+    """Prefetch one or more managed-memory ranges to a target location.
 
     Parameters
     ----------
-    target : :class:`Buffer`
-        Managed allocation to operate on.
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous prefetch.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to operate on.
+    location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...]
+        Target location(s). A single location applies to all targets; a
+        sequence must match ``len(targets)``. ``Device`` and ``int`` values
+        are coerced to :class:`Location` (``-1`` maps to host).
+    options : None
+        Reserved for future per-call flags. Must be ``None``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous prefetch (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``.
     """
-    if not isinstance(target, Buffer):
-        raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}")
-    cdef Buffer buf = <Buffer>target
-    _require_managed_buffer(buf, "prefetch")
+    if options is not None:
+        raise TypeError(
+            f"prefetch options must be None (reserved); got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, False, "prefetch")
     cdef Stream s = Stream_accept(stream)
-    cdef object ptr = buf.handle
-    cdef size_t nbytes = buf._size
 
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "prefetch",
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                location,
-                _MANAGED_OPERATION_FLAGS,
-                s.handle,
-            )
-        )
+    cdef Buffer buf
+    for buf in bufs:
+        _require_managed_buffer(buf, "prefetch")
+
+    if n == 1:
+        _do_single_prefetch(<Buffer>bufs[0], locs[0], s)
     else:
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                _managed_location_to_legacy_device(location, "prefetch"),
-                s.handle,
-            )
+        _do_batch_prefetch(bufs, locs, s)
+
+
+cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):
+    cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)
+    cdef size_t nbytes = buf._size
+    cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef cydriver.CUmemLocation cu_loc = _to_cumemlocation(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, cu_loc, 0, hstream))
+    ELSE:
+        cdef int dev_int = _to_legacy_device(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream))
+
+
+cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc(
+            n * sizeof(cydriver.CUmemLocation)
+        )
+        cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes and loc_arr and loc_indices):
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+                loc_arr[i] = _to_cumemlocation(locs[i])
+                loc_indices[i] = <size_t>i
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync(
+                    ptrs, sizes, <size_t>n,
+                    loc_arr, loc_indices, <size_t>n,
+                    0, hstream,
+                ))
+        finally:
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+    ELSE:
+        raise NotImplementedError(
+            "batched prefetch requires a CUDA 13 build of cuda.core"
         )
 
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 2304c370fd..89c8fda1c0 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1383,32 +1383,6 @@ def fake_cuMemAdvise(ptr, size, advice, location):
     buffer.close()
 
 
-def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda):
-    device = Device()
-    _skip_if_managed_location_ops_unsupported(device)
-    device.set_current()
-
-    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-    stream = device.create_stream()
-    calls = []
-
-    def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
-        calls.append((ptr, size, location, hstream))
-        return (driver.CUresult.CUDA_SUCCESS,)
-
-    monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION)
-    monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
-    monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)
-
-    managed_memory.prefetch(buffer, device, stream=stream)
-
-    assert len(calls) == 1
-    assert calls[0][2] == device.device_id
-    assert int(calls[0][3]) == int(stream.handle)
-
-    buffer.close()
-
-
 def test_managed_memory_operations_reject_non_managed_allocations(init_cuda):
     device = Device()
     device.set_current()
@@ -1435,12 +1409,10 @@ def test_managed_memory_operation_validation(init_cuda):
     buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    with pytest.raises(ValueError, match="requires a location"):
+    with pytest.raises(ValueError, match="location is required"):
         managed_memory.prefetch(buffer, stream=stream)
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
         managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa")
-    with pytest.raises(ValueError, match="location must be None or -1"):
-        managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host")
 
     buffer.close()
 
@@ -2005,3 +1977,120 @@ def test_bad_type(self):
         from cuda.core._memory._managed_location import _coerce_location
         with pytest.raises(TypeError, match="Location, Device, int, or None"):
             _coerce_location("device")
+
+
+class TestPrefetch:
+    def test_single_with_location_host(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+
+        prefetch(buf, Location.host(), stream=stream)
+        stream.sync()
+        last = _get_int_mem_range_attr(
+            buf,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last == _HOST_LOCATION_ID
+        buf.close()
+
+    def test_batched_same_location(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
+            pytest.skip("cuMemPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)]
+        stream = device.create_stream()
+
+        prefetch(bufs, Location.device(device.device_id), stream=stream)
+        stream.sync()
+
+        for buf in bufs:
+            last = _get_int_mem_range_attr(
+                buf,
+                driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+            )
+            assert last == device.device_id
+            buf.close()
+
+    def test_batched_per_buffer_location(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
+            pytest.skip("cuMemPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+
+        prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream)
+        stream.sync()
+
+        last0 = _get_int_mem_range_attr(
+            bufs[0],
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        last1 = _get_int_mem_range_attr(
+            bufs[1],
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last0 == _HOST_LOCATION_ID
+        assert last1 == device.device_id
+        for buf in bufs:
+            buf.close()
+
+    def test_length_mismatch(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+
+        with pytest.raises(ValueError, match="length"):
+            prefetch(bufs, [Location.host()], stream=stream)
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            prefetch(buf, Location.host(), stream=stream)
+        buf.close()
+
+    def test_location_required(self, init_cuda):
+        from cuda.core.managed_memory import prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="location is required"):
+            prefetch(buf, None, stream=stream)
+        buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.managed_memory import Location, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(TypeError, match="must be None"):
+            prefetch(buf, Location.host(), options={}, stream=stream)
+        buf.close()

From e296e72986b124dcbb07027e17160a5e0290b8b0 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 17:59:24 -0700
Subject: [PATCH 24/31] feat(cuda.core): add managed_memory.discard

Add a new discard(targets, *, options=None, stream) free function that
wraps cuMemDiscardBatchAsync. It accepts a single Buffer or a sequence of
Buffers; any number of targets (N >= 1) is dispatched through the batched
driver entry point. Requires a CUDA 13 build of cuda.core; on CUDA 12
builds the call raises NotImplementedError.

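Closes the second of three batched managed-memory operations from #1333
(listed by their CUDA runtime names; the implementation calls the
corresponding cuMem* driver entry points):
  P1: cudaMemDiscardBatchAsync               <- this commit
  P1: cudaMemPrefetchBatchAsync              <- 818f5d25d8
  P1: cudaMemDiscardAndPrefetchBatchAsync    <- next commit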

Re-exported from cuda.core.managed_memory.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_memory_ops.pyx | 71 +++++++++++++++++++
 cuda_core/cuda/core/managed_memory.py         |  4 +-
 cuda_core/tests/test_memory.py                | 57 +++++++++++++++
 3 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index b608b532ab..031b56a8af 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -313,6 +313,77 @@ cdef void _require_managed_discard_prefetch_support(str what):
         )
 
 
+def discard(
+    targets,
+    *,
+    options=None,
+    stream,
+):
+    """Discard one or more managed-memory ranges.
+
+    Parameters
+    ----------
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to discard. Their resident pages
+        are released without prefetching new contents; subsequent access
+        is satisfied by lazy migration.
+    options : None
+        Reserved for future per-call flags. Must be ``None``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous discard (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+.
+    """
+    if options is not None:
+        raise TypeError(
+            f"discard options must be None (reserved); got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "discard")
+    cdef Py_ssize_t n = len(bufs)
+    cdef Stream s = Stream_accept(stream)
+
+    cdef Buffer buf
+    for buf in bufs:
+        _require_managed_buffer(buf, "discard")
+
+    _do_batch_discard(bufs, s)
+
+
+cdef void _do_batch_discard(tuple bufs, Stream s):
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes):
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
+                    ptrs, sizes, <size_t>n, 0, hstream,
+                ))
+        finally:
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+    ELSE:
+        raise NotImplementedError(
+            "discard requires a CUDA 13 build of cuda.core"
+        )
+
+
 def advise(
     target: Buffer,
     advice: driver.CUmem_advise | str,
diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
index 25191fe038..509e874ccc 100644
--- a/cuda_core/cuda/core/managed_memory.py
+++ b/cuda_core/cuda/core/managed_memory.py
@@ -5,6 +5,6 @@
 """Managed-memory range operations."""
 
 from cuda.core._memory._managed_location import Location
-from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch
+from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch
 
-__all__ = ["Location", "advise", "discard_prefetch", "prefetch"]
+__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"]
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 89c8fda1c0..c18fa72519 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -2094,3 +2094,60 @@ def test_options_must_be_none(self, init_cuda):
         with pytest.raises(TypeError, match="must be None"):
             prefetch(buf, Location.host(), options={}, stream=stream)
         buf.close()
+
+
+class TestDiscard:
+    def test_single_buffer(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardBatchAsync"):
+            pytest.skip("cuMemDiscardBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        prefetch(buf, Location.device(device.device_id), stream=stream)
+        stream.sync()
+        discard(buf, stream=stream)
+        stream.sync()
+        buf.close()
+
+    def test_batched(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardBatchAsync"):
+            pytest.skip("cuMemDiscardBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)]
+        stream = device.create_stream()
+        prefetch(bufs, Location.device(device.device_id), stream=stream)
+        stream.sync()
+        discard(bufs, stream=stream)
+        stream.sync()
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.managed_memory import discard
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            discard(buf, stream=stream)
+        buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.managed_memory import discard
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(TypeError, match="must be None"):
+            discard(buf, options={}, stream=stream)
+        buf.close()

From e697131defa9c65cce468b8f946e0f16f442744a Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:01:33 -0700
Subject: [PATCH 25/31] feat(cuda.core): unified 1..N
 managed_memory.discard_prefetch with cydriver

Rewrite discard_prefetch() with the unified single-or-batched signature:

  discard_prefetch(targets, location, *, options=None, stream)

- targets accepts a single Buffer or a sequence of Buffers
- location accepts a Location, Device, int, or per-buffer sequence
- length mismatch / empty targets raise ValueError
- options must be None (reserved)
- stream moved to end, kept keyword-only

Internals: switch from Python-level driver.cuMemDiscardAndPrefetchBatchAsync
to Cython-level cydriver.cuMemDiscardAndPrefetchBatchAsync. The runtime
discard-prefetch availability check (which raised RuntimeError) is no
longer called by discard_prefetch; support is now decided at compile time
via IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE, and on CUDA 12 builds the call
raises NotImplementedError.

The location_type= keyword is removed; use the Location dataclass instead.

Closes the third managed-memory batched op from #1333.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_memory_ops.pyx | 117 ++++++++++++------
 cuda_core/tests/test_memory.py                |  70 +++++++++++
 2 files changed, 147 insertions(+), 40 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 031b56a8af..2192688320 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -543,51 +543,88 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):
 
 
 def discard_prefetch(
-    target: Buffer,
-    location: Device | int | None = None,
+    targets,
+    location=None,
     *,
-    stream: Stream | GraphBuilder,
-    location_type: str | None = None,
+    options=None,
+    stream,
 ):
-    """Discard a managed-memory allocation range and prefetch it to a target location.
+    """Discard one or more managed-memory ranges and prefetch them to a target location.
 
     Parameters
     ----------
-    target : :class:`Buffer`
-        Managed allocation to operate on.
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for discard_prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous operation.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to discard and re-prefetch.
+    location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...]
+        Target location(s). A single location applies to all targets;
+        a sequence must match ``len(targets)``.
+    options : None
+        Reserved for future per-call flags. Must be ``None``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous operation (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch
+        requires CUDA 13+.
     """
-    if not isinstance(target, Buffer):
-        raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}")
-    cdef Buffer buf = <Buffer>target
-    _require_managed_buffer(buf, "discard_prefetch")
-    _require_managed_discard_prefetch_support("discard_prefetch")
+    if options is not None:
+        raise TypeError(
+            f"discard_prefetch options must be None (reserved); "
+            f"got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch")
     cdef Stream s = Stream_accept(stream)
-    cdef object ptr = buf.handle
-    cdef size_t nbytes = buf._size
-    cdef object batch_ptr = driver.CUdeviceptr(int(ptr))
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "discard_prefetch",
-    )
-    handle_return(
-        driver.cuMemDiscardAndPrefetchBatchAsync(
-            [batch_ptr],
-            [nbytes],
-            _SINGLE_RANGE_COUNT,
-            [location],
-            [_FIRST_PREFETCH_LOCATION_INDEX],
-            _SINGLE_PREFETCH_LOCATION_COUNT,
-            _MANAGED_OPERATION_FLAGS,
-            s.handle,
+
+    cdef Buffer buf
+    for buf in bufs:
+        _require_managed_buffer(buf, "discard_prefetch")
+
+    _do_batch_discard_prefetch(bufs, locs, s)
+
+
+cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc(
+            n * sizeof(cydriver.CUmemLocation)
+        )
+        cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes and loc_arr and loc_indices):
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+                loc_arr[i] = _to_cumemlocation(locs[i])
+                loc_indices[i] = <size_t>i
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync(
+                    ptrs, sizes, <size_t>n,
+                    loc_arr, loc_indices, <size_t>n,
+                    0, hstream,
+                ))
+        finally:
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+    ELSE:
+        raise NotImplementedError(
+            "discard_prefetch requires a CUDA 13 build of cuda.core"
         )
-    )
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index c18fa72519..627a60bb3f 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -2151,3 +2151,73 @@ def test_options_must_be_none(self, init_cuda):
         with pytest.raises(TypeError, match="must be None"):
             discard(buf, options={}, stream=stream)
         buf.close()
+
+
+class TestDiscardPrefetch:
+    def test_single_buffer(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+            pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+
+        prefetch(buf, Location.host(), stream=stream)
+        stream.sync()
+        discard_prefetch(buf, Location.device(device.device_id), stream=stream)
+        stream.sync()
+
+        last = _get_int_mem_range_attr(
+            buf,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last == device.device_id
+        buf.close()
+
+    def test_batched_same_location(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+            pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        prefetch(bufs, Location.host(), stream=stream)
+        stream.sync()
+        discard_prefetch(bufs, Location.device(device.device_id), stream=stream)
+        stream.sync()
+        for buf in bufs:
+            last = _get_int_mem_range_attr(
+                buf,
+                driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+            )
+            assert last == device.device_id
+            buf.close()
+
+    def test_length_mismatch(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard_prefetch
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="length"):
+            discard_prefetch(bufs, [Location.host()], stream=stream)
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.managed_memory import Location, discard_prefetch
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            discard_prefetch(buf, Location.host(), stream=stream)
+        buf.close()

From 3bc10219dc3086d5449aa811e2f6086b73d915fb Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:08:40 -0700
Subject: [PATCH 26/31] feat(cuda.core): unified 1..N managed_memory.advise +
 drop legacy apparatus

Rewrite advise() with the unified single-or-batched signature:

  advise(targets, advice, location=None, *, options=None)

- targets accepts a single Buffer or a sequence
- advice still accepts string aliases or driver.CUmem_advise enum values
- location accepts a Location dataclass, Device, int, None, or a
  per-buffer sequence (None is permitted only for set_read_mostly,
  unset_read_mostly, and unset_preferred_location)
- Per-advice allowed-kind validation ported to operate on Location.kind
  (matches CUDA driver constraints from existing tables)
- options reserved for future per-call flags
- For N>1, loops cydriver.cuMemAdvise per buffer (no batched advise API
  exists in CUDA)

Internals: switch to cydriver.cuMemAdvise (Cython-level); use compile-time
IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE for the 12/13 ABI split.

Drop the legacy apparatus that all four functions previously shared:
- _normalize_managed_location (returned Python driver.CUmemLocation)
- _make_managed_location, _managed_location_enum
- _managed_location_uses_v2_bindings + _V2_BINDINGS lazy cache
- _managed_location_to_legacy_device + _LEGACY_LOC_DEVICE/HOST cache
- _require_managed_discard_prefetch_support
- Unused module-level constants (_HOST_NUMA_CURRENT_ID,
  _SINGLE_RANGE_COUNT, _MANAGED_OPERATION_FLAGS, etc.)

Also drop test_managed_memory_advise_uses_legacy_bindings_signature and
the _LEGACY_BINDINGS_VERSION constant; the runtime version switch is
gone, replaced by compile-time IF/ELSE that the test could not exercise.
The CUDA 12 vs CUDA 13 paths are now covered by the build-matrix CI job.

Closes Task 8 (advise) and Task 9 (legacy-bindings test cleanup) from
docs/superpowers/plans/2026-04-27-managed-memory-ops-batched.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_memory_ops.pyx | 270 +++++-------------
 cuda_core/tests/test_memory.py                |  91 ++++--
 2 files changed, 127 insertions(+), 234 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 2192688320..11236a1ecf 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -13,26 +13,10 @@ from cuda.core._resource_handles cimport as_cu
 from cuda.core._stream cimport Stream, Stream_accept
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
-from cuda.core._utils.cuda_utils import driver, handle_return
-from cuda.core._utils.version import binding_version
-from cuda.core._device import Device
+from cuda.core._utils.cuda_utils import driver
 from cuda.core._memory._managed_location import Location, _coerce_location
 
 
-cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
-    "device",
-    "host",
-    "host_numa",
-    "host_numa_current",
-)
-
-cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
-    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
-    "host": "CU_MEM_LOCATION_TYPE_HOST",
-    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
-    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
-}
-
 cdef dict _MANAGED_ADVICE_ALIASES = {
     "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
     "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
@@ -61,43 +45,8 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
     "unset_accessed_by": _DEVICE_HOST_ONLY,
 }
 
-cdef int _HOST_NUMA_CURRENT_ID = 0
-cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
-cdef size_t _SINGLE_RANGE_COUNT = 1
-cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
-cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
-
-# Lazily cached values for immutable runtime properties.
-cdef object _CU_DEVICE_CPU = None
+# Lazily cached: maps driver.CUmem_advise enum value → string alias.
 cdef dict _ADVICE_ENUM_TO_ALIAS = None
-_V2_BINDINGS = -1
-cdef int _DISCARD_PREFETCH_SUPPORTED = -1
-
-
-cdef object _managed_location_enum(str location_type):
-    cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
-    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
-    if result is None:
-        raise RuntimeError(
-            f"Managed-memory location type {location_type!r} is not supported by the "
-            f"installed cuda.bindings package."
-        )
-    return result
-
-
-cdef object _make_managed_location(str location_type, int location_id):
-    global _CU_DEVICE_CPU
-    cdef object location = driver.CUmemLocation()
-    location.type = _managed_location_enum(location_type)
-    if location_type == "host":
-        if _CU_DEVICE_CPU is None:
-            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
-        location.id = _CU_DEVICE_CPU
-    elif location_type == "host_numa_current":
-        location.id = _HOST_NUMA_CURRENT_ID
-    else:
-        location.id = location_id
-    return location
 
 
 cdef tuple _normalize_managed_advice(object advice):
@@ -131,104 +80,6 @@ cdef tuple _normalize_managed_advice(object advice):
     )
 
 
-cdef object _normalize_managed_location(
-    object location,
-    object location_type,
-    str what,
-    bint allow_none=False,
-    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
-):
-    cdef object loc_type
-    cdef int loc_id
-
-    if isinstance(location, Device):
-        location = location.device_id
-
-    if location_type is not None and not isinstance(location_type, str):
-        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
-
-    loc_type = None if location_type is None else (<str>location_type).lower()
-    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
-        raise ValueError(
-            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
-            f"or None, got {location_type!r}"
-        )
-
-    if loc_type is not None and loc_type not in allowed_loctypes:
-        raise ValueError(f"{what} does not support location_type='{loc_type}'")
-
-    if loc_type is None:
-        if location is None:
-            if allow_none:
-                return _make_managed_location("host", -1)
-            raise ValueError(f"{what} requires a location")
-        if not isinstance(location, int):
-            raise TypeError(
-                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
-            )
-        loc_id = <int>location
-        if loc_id == -1:
-            if "host" not in allowed_loctypes:
-                raise ValueError(f"{what} does not support host locations")
-            return _make_managed_location("host", -1)
-        elif loc_id >= 0:
-            return _make_managed_location("device", loc_id)
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
-            )
-    elif loc_type == "device":
-        if isinstance(location, int) and <int>location >= 0:
-            loc_id = <int>location
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
-            )
-        return _make_managed_location(loc_type, loc_id)
-    elif loc_type == "host":
-        if location not in (None, -1):
-            raise ValueError(
-                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
-            )
-        return _make_managed_location(loc_type, -1)
-    elif loc_type == "host_numa":
-        if not isinstance(location, int) or <int>location < 0:
-            raise ValueError(
-                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
-            )
-        return _make_managed_location(loc_type, <int>location)
-    else:
-        if location is not None:
-            raise ValueError(
-                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
-            )
-        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
-
-
-cdef bint _managed_location_uses_v2_bindings():
-    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
-    global _V2_BINDINGS
-    if _V2_BINDINGS < 0:
-        _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0
-    return _V2_BINDINGS != 0
-
-
-cdef object _LEGACY_LOC_DEVICE = None
-cdef object _LEGACY_LOC_HOST = None
-
-cdef int _managed_location_to_legacy_device(object location, str what):
-    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
-    if _LEGACY_LOC_DEVICE is None:
-        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
-        _LEGACY_LOC_HOST = _managed_location_enum("host")
-    cdef object loc_type = location.type
-    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
-        return <int>location.id
-    raise RuntimeError(
-        f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
-    )
-
-
 cdef void _require_managed_buffer(Buffer self, str what):
     _init_mem_attrs(self)
     if not self._mem_attrs.is_managed:
@@ -303,16 +154,6 @@ ELSE:
         )
 
 
-cdef void _require_managed_discard_prefetch_support(str what):
-    global _DISCARD_PREFETCH_SUPPORTED
-    if _DISCARD_PREFETCH_SUPPORTED < 0:
-        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
-    if not _DISCARD_PREFETCH_SUPPORTED:
-        raise RuntimeError(
-            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
-        )
-
-
 def discard(
     targets,
     *,
@@ -385,57 +226,80 @@ cdef void _do_batch_discard(tuple bufs, Stream s):
 
 
 def advise(
-    target: Buffer,
-    advice: driver.CUmem_advise | str,
-    location: Device | int | None = None,
+    targets,
+    advice,
+    location=None,
     *,
-    location_type: str | None = None,
+    options=None,
 ):
-    """Apply managed-memory advice to an allocation range.
+    """Apply managed-memory advice to one or more allocation ranges.
 
     Parameters
     ----------
-    target : :class:`Buffer`
-        Managed allocation to operate on.
-    advice : :obj:`~driver.CUmem_advise` | str
-        Managed-memory advice to apply. String aliases such as
-        ``"set_read_mostly"``, ``"set_preferred_location"``, and
-        ``"set_accessed_by"`` are accepted.
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None`` for
-        advice values that ignore location.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to advise.
+    advice : str | :obj:`~driver.CUmem_advise`
+        Managed-memory advice. String aliases (``"set_read_mostly"``,
+        ``"unset_read_mostly"``, ``"set_preferred_location"``,
+        ``"unset_preferred_location"``, ``"set_accessed_by"``,
+        ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted.
+    location : :class:`Location` | :obj:`~_device.Device` | int | None | Sequence[...], optional
+        Target location(s). Required for advice values that consult a
+        location; ignored (may be ``None``) for ``set_read_mostly``,
+        ``unset_read_mostly``, and ``unset_preferred_location``. A sequence
+        must match ``len(targets)``.
+    options : None
+        Reserved for future per-call flags. Must be ``None``.
     """
-    if not isinstance(target, Buffer):
-        raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}")
-    cdef Buffer buf = <Buffer>target
-    _require_managed_buffer(buf, "advise")
+    if options is not None:
+        raise TypeError(
+            f"advise options must be None (reserved); got {type(options).__name__}"
+        )
     cdef str advice_name
-    cdef object ptr = buf.handle
-    cdef size_t nbytes = buf._size
+    cdef object advice_value
+    advice_name, advice_value = _normalize_managed_advice(advice)
+    cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION
+    cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name]
 
-    advice_name, advice = _normalize_managed_advice(advice)
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "advise",
-        allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
-    else:
-        handle_return(
-            driver.cuMemAdvise(
-                ptr,
-                nbytes,
-                advice,
-                _managed_location_to_legacy_device(location, "advise"),
+    cdef tuple bufs = _coerce_buffer_targets(targets, "advise")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise")
+
+    cdef Buffer buf
+    cdef object loc
+    for buf in bufs:
+        _require_managed_buffer(buf, "advise")
+    for loc in locs:
+        if loc is not None and loc.kind not in allowed_kinds:
+            raise ValueError(
+                f"advise '{advice_name}' does not support location_type='{loc.kind}'"
             )
-        )
+
+    cdef Py_ssize_t i
+    for i in range(n):
+        _do_single_advise(<Buffer>bufs[i], advice_value, locs[i], allow_none)
+
+
+cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
+    cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)
+    cdef size_t nbytes = buf._size
+    cdef cydriver.CUmem_advise advice_enum = <cydriver.CUmem_advise>(<int>int(advice_value))
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef cydriver.CUmemLocation cu_loc
+        if loc is None:
+            # Driver ignores location for read_mostly / unset_preferred_location
+            # advice values but still validates the CUmemLocation; pass a
+            # host placeholder.
+            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+            cu_loc.id = 0
+        else:
+            cu_loc = _to_cumemlocation(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, cu_loc))
+    ELSE:
+        cdef int dev_int = -1 if loc is None else _to_legacy_device(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int))
 
 
 def prefetch(
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 627a60bb3f..a469c63a10 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -54,7 +54,6 @@
 _READ_MOSTLY_ENABLED = 1
 _HOST_LOCATION_ID = -1
 _INVALID_HOST_DEVICE_ORDINAL = 0
-_LEGACY_BINDINGS_VERSION = (12, 9)
 
 
 class DummyDeviceMemoryResource(MemoryResource):
@@ -1264,6 +1263,8 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
 
 
 def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
+    from cuda.core.managed_memory import Location
+
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
@@ -1281,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
 
     # cuda.bindings currently exposes the combined location attributes for
     # cuMemRangeGetAttribute, so use the legacy location query here.
-    managed_memory.advise(buffer, "set_preferred_location", location_type="host")
+    managed_memory.advise(buffer, "set_preferred_location", Location.host())
     preferred_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
@@ -1359,30 +1360,6 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i
     buffer.close()
 
 
-def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda):
-    device = Device()
-    _skip_if_managed_allocation_unsupported(device)
-    device.set_current()
-
-    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-    calls = []
-
-    def fake_cuMemAdvise(ptr, size, advice, location):
-        calls.append((ptr, size, advice, location))
-        return (driver.CUresult.CUDA_SUCCESS,)
-
-    monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION)
-    monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1)
-    monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise)
-
-    managed_memory.advise(buffer, "set_read_mostly")
-
-    assert len(calls) == 1
-    assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID))
-
-    buffer.close()
-
-
 def test_managed_memory_operations_reject_non_managed_allocations(init_cuda):
     device = Device()
     device.set_current()
@@ -1411,14 +1388,17 @@ def test_managed_memory_operation_validation(init_cuda):
 
     with pytest.raises(ValueError, match="location is required"):
         managed_memory.prefetch(buffer, stream=stream)
+    from cuda.core.managed_memory import Location
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
-        managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa")
+        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL))
 
     buffer.close()
 
 
 def test_managed_memory_advise_location_validation(init_cuda):
     """Verify doc-specified location constraints for each advice kind."""
+    from cuda.core.managed_memory import Location
+
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
     device.set_current()
@@ -1431,16 +1411,16 @@ def test_managed_memory_advise_location_validation(init_cuda):
     # set_preferred_location requires a location; device ordinal works
     managed_memory.advise(buffer, "set_preferred_location", device.device_id)
 
-    # set_preferred_location with host location_type
-    managed_memory.advise(buffer, "set_preferred_location", location_type="host")
+    # set_preferred_location with host location
+    managed_memory.advise(buffer, "set_preferred_location", Location.host())
 
     # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs)
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
-        managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa")
+        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0))
 
     # set_accessed_by with host_numa_current also raises ValueError
     with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"):
-        managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current")
+        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current())
 
     # Inferred location from int: -1 maps to host, 0 maps to device
     managed_memory.advise(buffer, "set_preferred_location", -1)
@@ -2221,3 +2201,52 @@ def test_rejects_non_managed(self, init_cuda):
         with pytest.raises(ValueError, match="managed-memory"):
             discard_prefetch(buf, Location.host(), stream=stream)
         buf.close()
+
+
+class TestAdvise:
+    def test_batched_same_advice(self, init_cuda):
+        from cuda.core.managed_memory import advise, Location
+        device = Device()
+        _skip_if_managed_location_ops_unsupported(device)
+        device.set_current()
+        bufs = [
+            DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+            for _ in range(2)
+        ]
+        advise(bufs, "set_read_mostly")
+        for buf in bufs:
+            assert (
+                _get_int_mem_range_attr(
+                    buf,
+                    driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+                )
+                == _READ_MOSTLY_ENABLED
+            )
+            buf.close()
+
+    def test_batched_per_buffer_location(self, init_cuda):
+        from cuda.core.managed_memory import advise, Location
+        device = Device()
+        _skip_if_managed_location_ops_unsupported(device)
+        device.set_current()
+        bufs = [
+            DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+            for _ in range(2)
+        ]
+        advise(
+            bufs,
+            "set_preferred_location",
+            [Location.host(), Location.device(device.device_id)],
+        )
+        for buf in bufs:
+            buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.managed_memory import advise
+        device = Device()
+        _skip_if_managed_allocation_unsupported(device)
+        device.set_current()
+        buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        with pytest.raises(TypeError, match="must be None"):
+            advise(buf, "set_read_mostly", options={})
+        buf.close()

From fa238696802fc762b0008a20c091e998ab7e7b2b Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:10:21 -0700
Subject: [PATCH 27/31] refactor(cuda.core): use Buffer.is_managed property in
 managed_memory ops

_require_managed_buffer was poking at Buffer._mem_attrs.is_managed
directly via _init_mem_attrs(). PR #1924 added the public Buffer.is_managed
property which falls back to MemoryResource.is_managed when the pointer
attribute query does not advertise managed memory (the case for pool-
allocated managed memory).

Switch _require_managed_buffer to the public property. This also fixes
a latent bug where pool-allocated managed buffers were being rejected
by the managed_memory ops despite Buffer.is_managed correctly reporting
True.

Drops the no-longer-needed cimport of _init_mem_attrs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 11236a1ecf..f4e13ef16e 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -8,7 +8,7 @@ from cpython.mem cimport PyMem_Free, PyMem_Malloc
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs
+from cuda.core._memory._buffer cimport Buffer
 from cuda.core._resource_handles cimport as_cu
 from cuda.core._stream cimport Stream, Stream_accept
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
@@ -81,8 +81,10 @@ cdef tuple _normalize_managed_advice(object advice):
 
 
 cdef void _require_managed_buffer(Buffer self, str what):
-    _init_mem_attrs(self)
-    if not self._mem_attrs.is_managed:
+    # Buffer.is_managed handles both pointer-attribute and memory-resource
+    # paths (e.g. pool-allocated managed memory whose pointer attribute
+    # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
+    if not self.is_managed:
         raise ValueError(f"{what} requires a managed-memory allocation")
 
 

From 68bdd14357598b53dc7c0d7a2654b014d876f58f Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:10:56 -0700
Subject: [PATCH 28/31] docs(cuda.core): document Location, discard, and 1..N
 managed_memory ops

api.rst: add Location and discard to the managed_memory autosummary.

1.0.0-notes.rst: replace the placeholder bullet with a description of the
unified 1..N API, the Location dataclass, and the dispatch to batched
driver entry points on cuda.bindings 12.8+.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/docs/source/api.rst                 |  2 ++
 cuda_core/docs/source/release/1.0.0-notes.rst | 15 ++++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index b7df6d7b96..fd0e01dedf 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -137,8 +137,10 @@ Managed memory
 .. autosummary::
    :toctree: generated/
 
+   Location
    advise
    prefetch
+   discard
    discard_prefetch
 
 .. module:: cuda.core
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 4008c86f5d..25e9066761 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -17,11 +17,16 @@ New features
 ------------
 
 - Added managed-memory range operations under :mod:`cuda.core.managed_memory`:
-  ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free
-  functions accept either a managed :class:`Buffer` or a raw pointer plus
-  ``size=``, validate that the target allocation is managed memory, and then
-  forward to the corresponding CUDA driver operations for range advice and
-  migration.
+  :class:`~managed_memory.Location`, :func:`~managed_memory.advise`,
+  :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and
+  :func:`~managed_memory.discard_prefetch`. Each operation accepts either a
+  single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ the
+  N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver entry
+  point (``advise`` instead calls ``cuMemAdvise`` once per buffer),
+  addressing the managed-memory portion of #1333. Locations are expressed via
+  the typed :class:`~managed_memory.Location` dataclass (with classmethod
+  constructors ``device``, ``host``, ``host_numa``, and ``host_numa_current``);
+  ``Device`` and ``int`` values are still accepted for ergonomic compatibility.
 
 
 Fixes and enhancements

From b4d9cbfa7270e7da9e260d457a1678f38bd2833d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:20:19 -0700
Subject: [PATCH 29/31] chore(cuda.core): drop narrative comments and tighten
 _coerce_location docstring

Per /simplify review, remove WHAT-only comments that just restate the
function signature in front of _coerce_buffer_targets and
_broadcast_locations. Tighten the _coerce_location docstring to lead
with the conversion intent rather than restate the type annotation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_memory/_managed_location.py    | 5 ++---
 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
index e081a8da32..8d1605153f 100644
--- a/cuda_core/cuda/core/_memory/_managed_location.py
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -52,10 +52,9 @@ def host_numa_current(cls) -> "Location":
 
 
 def _coerce_location(value, *, allow_none: bool = False) -> Location | None:
-    """Coerce user input to a Location instance.
+    """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``.
 
-    Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device,
-    -1 → host), None (only if allow_none=True).
+    Maps int ``-1`` to host and any non-negative int to the device with that ordinal.
     """
     from cuda.core._device import Device  # avoid import cycle at module load
 
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index f4e13ef16e..90e5611a2d 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -88,7 +88,6 @@ cdef void _require_managed_buffer(Buffer self, str what):
         raise ValueError(f"{what} requires a managed-memory allocation")
 
 
-# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...].
 cdef tuple _coerce_buffer_targets(object targets, str what):
     cdef list out
     if isinstance(targets, Buffer):
@@ -110,8 +109,6 @@ cdef tuple _coerce_buffer_targets(object targets, str what):
     )
 
 
-# Broadcast a single location across ``n`` targets, or coerce a length-N
-# sequence elementwise.
 cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
     cdef object coerced
     if isinstance(location, (list, tuple)):

From ee967583b78d014723db47b9cc4b145bf9c031fa Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 18:41:19 -0700
Subject: [PATCH 30/31] chore(cuda.core): satisfy pre-commit hooks

- ruff auto-applied:
  * Drop unused `_managed_memory_ops` test import (no longer needed
    after the legacy-bindings monkeypatch test was deleted)
  * Drop "Location" string-quoted forward refs in
    _managed_location.py (file already uses `from __future__ import
    annotations`)
  * Reformat string concatenations and add blank-line-after-import
    spacing
- cython-lint auto-applied:
  * Drop unused libc.stdint cimport of `uintptr_t`
  * Drop unused `Location` Python import (only used in docstrings)
  * Drop unused `n` local in `discard()`
  * Move `cpython.mem cimport` of PyMem_Free / PyMem_Malloc inside
    the `IF CUDA_CORE_BUILD_MAJOR >= 13:` block where the symbols
    are actually used; cython-lint cannot see across compile-time
    branches.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../cuda/core/_memory/_managed_location.py    | 16 +++---
 .../cuda/core/_memory/_managed_memory_ops.pyx |  7 ++-
 cuda_core/tests/test_memory.py                | 51 +++++++++++++++----
 3 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
index 8d1605153f..0e89cb92e3 100644
--- a/cuda_core/cuda/core/_memory/_managed_location.py
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -35,19 +35,19 @@ def __post_init__(self) -> None:
                 raise ValueError(f"{self.kind} location must have id=None")
 
     @classmethod
-    def device(cls, device_id: int) -> "Location":
+    def device(cls, device_id: int) -> Location:
         return cls(kind="device", id=device_id)
 
     @classmethod
-    def host(cls) -> "Location":
+    def host(cls) -> Location:
         return cls(kind="host", id=None)
 
     @classmethod
-    def host_numa(cls, numa_id: int) -> "Location":
+    def host_numa(cls, numa_id: int) -> Location:
         return cls(kind="host_numa", id=numa_id)
 
     @classmethod
-    def host_numa_current(cls) -> "Location":
+    def host_numa_current(cls) -> Location:
         return cls(kind="host_numa_current", id=None)
 
 
@@ -71,9 +71,5 @@ def _coerce_location(value, *, allow_none: bool = False) -> Location | None:
             return Location.host()
         if value >= 0:
             return Location.device(value)
-        raise ValueError(
-            f"device ordinal must be >= 0 (or -1 for host), got {value}"
-        )
-    raise TypeError(
-        f"location must be a Location, Device, int, or None; got {type(value).__name__}"
-    )
+        raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}")
+    raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}")
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 90e5611a2d..9926cbe67f 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -4,8 +4,8 @@
 
 from __future__ import annotations
 
-from cpython.mem cimport PyMem_Free, PyMem_Malloc
-from libc.stdint cimport uintptr_t
+IF CUDA_CORE_BUILD_MAJOR >= 13:
+    from cpython.mem cimport PyMem_Free, PyMem_Malloc
 
 from cuda.bindings cimport cydriver
 from cuda.core._memory._buffer cimport Buffer
@@ -14,7 +14,7 @@ from cuda.core._stream cimport Stream, Stream_accept
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 from cuda.core._utils.cuda_utils import driver
-from cuda.core._memory._managed_location import Location, _coerce_location
+from cuda.core._memory._managed_location import _coerce_location
 
 
 cdef dict _MANAGED_ADVICE_ALIASES = {
@@ -182,7 +182,6 @@ def discard(
             f"discard options must be None (reserved); got {type(options).__name__}"
         )
     cdef tuple bufs = _coerce_buffer_targets(targets, "discard")
-    cdef Py_ssize_t n = len(bufs)
     cdef Stream s = Stream_accept(stream)
 
     cdef Buffer buf
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index a469c63a10..36fdfd0347 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -44,7 +44,7 @@
     system as ccx_system,
 )
 from cuda.core._dlpack import DLDeviceType
-from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops
+from cuda.core._memory import IPCBufferDescriptor
 from cuda.core._utils.cuda_utils import CUDAError, handle_return
 from cuda.core.utils import StridedMemoryView
 
@@ -1389,6 +1389,7 @@ def test_managed_memory_operation_validation(init_cuda):
     with pytest.raises(ValueError, match="location is required"):
         managed_memory.prefetch(buffer, stream=stream)
     from cuda.core.managed_memory import Location
+
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
         managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL))
 
@@ -1875,42 +1876,50 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
 class TestLocation:
     def test_device_constructor(self):
         from cuda.core.managed_memory import Location
+
         loc = Location.device(0)
         assert loc.kind == "device"
         assert loc.id == 0
 
     def test_host_constructor(self):
         from cuda.core.managed_memory import Location
+
         loc = Location.host()
         assert loc.kind == "host"
         assert loc.id is None
 
     def test_host_numa_constructor(self):
         from cuda.core.managed_memory import Location
+
         loc = Location.host_numa(3)
         assert loc.kind == "host_numa"
         assert loc.id == 3
 
     def test_host_numa_current_constructor(self):
         from cuda.core.managed_memory import Location
+
         loc = Location.host_numa_current()
         assert loc.kind == "host_numa_current"
         assert loc.id is None
 
     def test_frozen(self):
         import dataclasses
+
         from cuda.core.managed_memory import Location
+
         loc = Location.device(0)
         with pytest.raises(dataclasses.FrozenInstanceError):
             loc.id = 1
 
     def test_invalid_device_id(self):
         from cuda.core.managed_memory import Location
+
         with pytest.raises(ValueError, match="device id must be >= 0"):
             Location.device(-1)
 
     def test_invalid_kind(self):
         from cuda.core.managed_memory import Location
+
         with pytest.raises(ValueError, match="kind must be one of"):
             Location(kind="not_a_kind", id=None)
 
@@ -1919,21 +1928,25 @@ class TestLocationCoerce:
     def test_passthrough(self):
         from cuda.core._memory._managed_location import _coerce_location
         from cuda.core.managed_memory import Location
+
         loc = Location.device(0)
         assert _coerce_location(loc) is loc
 
     def test_int_device(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         assert _coerce_location(0).kind == "device"
         assert _coerce_location(0).id == 0
 
     def test_int_minus_one_is_host(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         assert _coerce_location(-1).kind == "host"
 
     def test_device_object(self, init_cuda):
         from cuda.core import Device
         from cuda.core._memory._managed_location import _coerce_location
+
         dev = Device()
         loc = _coerce_location(dev)
         assert loc.kind == "device"
@@ -1941,20 +1954,24 @@ def test_device_object(self, init_cuda):
 
     def test_none_when_disallowed(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         with pytest.raises(ValueError, match="location is required"):
             _coerce_location(None, allow_none=False)
 
     def test_none_when_allowed(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         assert _coerce_location(None, allow_none=True) is None
 
     def test_bad_int(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         with pytest.raises(ValueError, match="device ordinal"):
             _coerce_location(-2)
 
     def test_bad_type(self):
         from cuda.core._memory._managed_location import _coerce_location
+
         with pytest.raises(TypeError, match="Location, Device, int, or None"):
             _coerce_location("device")
 
@@ -1962,6 +1979,7 @@ def test_bad_type(self):
 class TestPrefetch:
     def test_single_with_location_host(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -1980,6 +1998,7 @@ def test_single_with_location_host(self, init_cuda):
 
     def test_batched_same_location(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemPrefetchBatchAsync"):
@@ -2002,6 +2021,7 @@ def test_batched_same_location(self, init_cuda):
 
     def test_batched_per_buffer_location(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemPrefetchBatchAsync"):
@@ -2029,6 +2049,7 @@ def test_batched_per_buffer_location(self, init_cuda):
 
     def test_length_mismatch(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -2043,6 +2064,7 @@ def test_length_mismatch(self, init_cuda):
 
     def test_rejects_non_managed(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         device.set_current()
         buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -2053,6 +2075,7 @@ def test_rejects_non_managed(self, init_cuda):
 
     def test_location_required(self, init_cuda):
         from cuda.core.managed_memory import prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -2065,6 +2088,7 @@ def test_location_required(self, init_cuda):
 
     def test_options_must_be_none(self, init_cuda):
         from cuda.core.managed_memory import Location, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -2079,6 +2103,7 @@ def test_options_must_be_none(self, init_cuda):
 class TestDiscard:
     def test_single_buffer(self, init_cuda):
         from cuda.core.managed_memory import Location, discard, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemDiscardBatchAsync"):
@@ -2095,6 +2120,7 @@ def test_single_buffer(self, init_cuda):
 
     def test_batched(self, init_cuda):
         from cuda.core.managed_memory import Location, discard, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemDiscardBatchAsync"):
@@ -2112,6 +2138,7 @@ def test_batched(self, init_cuda):
 
     def test_rejects_non_managed(self, init_cuda):
         from cuda.core.managed_memory import discard
+
         device = Device()
         device.set_current()
         buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -2122,6 +2149,7 @@ def test_rejects_non_managed(self, init_cuda):
 
     def test_options_must_be_none(self, init_cuda):
         from cuda.core.managed_memory import discard
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -2136,6 +2164,7 @@ def test_options_must_be_none(self, init_cuda):
 class TestDiscardPrefetch:
     def test_single_buffer(self, init_cuda):
         from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
@@ -2159,6 +2188,7 @@ def test_single_buffer(self, init_cuda):
 
     def test_batched_same_location(self, init_cuda):
         from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
@@ -2181,6 +2211,7 @@ def test_batched_same_location(self, init_cuda):
 
     def test_length_mismatch(self, init_cuda):
         from cuda.core.managed_memory import Location, discard_prefetch
+
         device = Device()
         skip_if_managed_memory_unsupported(device)
         device.set_current()
@@ -2194,6 +2225,7 @@ def test_length_mismatch(self, init_cuda):
 
     def test_rejects_non_managed(self, init_cuda):
         from cuda.core.managed_memory import Location, discard_prefetch
+
         device = Device()
         device.set_current()
         buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -2205,14 +2237,12 @@ def test_rejects_non_managed(self, init_cuda):
 
 class TestAdvise:
     def test_batched_same_advice(self, init_cuda):
-        from cuda.core.managed_memory import advise, Location
+        from cuda.core.managed_memory import advise
+
         device = Device()
         _skip_if_managed_location_ops_unsupported(device)
         device.set_current()
-        bufs = [
-            DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-            for _ in range(2)
-        ]
+        bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
         advise(bufs, "set_read_mostly")
         for buf in bufs:
             assert (
@@ -2225,14 +2255,12 @@ def test_batched_same_advice(self, init_cuda):
             buf.close()
 
     def test_batched_per_buffer_location(self, init_cuda):
-        from cuda.core.managed_memory import advise, Location
+        from cuda.core.managed_memory import Location, advise
+
         device = Device()
         _skip_if_managed_location_ops_unsupported(device)
         device.set_current()
-        bufs = [
-            DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
-            for _ in range(2)
-        ]
+        bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
         advise(
             bufs,
             "set_preferred_location",
@@ -2243,6 +2271,7 @@ def test_batched_per_buffer_location(self, init_cuda):
 
     def test_options_must_be_none(self, init_cuda):
         from cuda.core.managed_memory import advise
+
         device = Device()
         _skip_if_managed_allocation_unsupported(device)
         device.set_current()

From d6f60f247a8572de41a2abfc20d872898bdf71f8 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 27 Apr 2026 19:08:33 -0700
Subject: [PATCH 31/31] refactor(cuda.core): move managed_memory ops to
 cuda.core.utils

Per Leo's review request (https://github.com/NVIDIA/cuda-python/pull/1775#discussion_r2991209111),
fold the managed-memory free functions and the Location dataclass into
cuda.core.utils rather than maintaining a dedicated cuda.core.managed_memory
namespace.

- Re-export Location, advise, prefetch, discard, discard_prefetch from
  cuda.core.utils.
- Delete cuda.core.managed_memory module.
- Update cuda.core.__init__ to drop the managed_memory submodule import.
- Update tests to import from cuda.core.utils.
- Update api.rst: drop the dedicated Managed memory section; add the
  managed-memory entries to the Utility functions section.
- Update 1.0.0-notes.rst accordingly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/__init__.py               |   2 +-
 cuda_core/cuda/core/managed_memory.py         |  10 --
 cuda_core/cuda/core/utils.py                  |   9 +-
 cuda_core/docs/source/api.rst                 |  23 +---
 cuda_core/docs/source/release/1.0.0-notes.rst |  22 ++--
 cuda_core/tests/test_memory.py                | 108 +++++++++---------
 6 files changed, 79 insertions(+), 95 deletions(-)
 delete mode 100644 cuda_core/cuda/core/managed_memory.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 61315dda5a..dfd52accea 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -28,7 +28,7 @@ def _import_versioned_module():
 del _import_versioned_module
 
 
-from cuda.core import managed_memory, system, utils
+from cuda.core import system, utils
 from cuda.core._device import Device
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py
deleted file mode 100644
index 509e874ccc..0000000000
--- a/cuda_core/cuda/core/managed_memory.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""Managed-memory range operations."""
-
-from cuda.core._memory._managed_location import Location
-from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch
-
-__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"]
diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py
index f15d924277..3d4b3e4c59 100644
--- a/cuda_core/cuda/core/utils.py
+++ b/cuda_core/cuda/core/utils.py
@@ -1,7 +1,14 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.core._memory._managed_location import Location  # noqa: F401
+from cuda.core._memory._managed_memory_ops import (
+    advise,  # noqa: F401
+    discard,  # noqa: F401
+    discard_prefetch,  # noqa: F401
+    prefetch,  # noqa: F401
+)
 from cuda.core._memoryview import (
     StridedMemoryView,  # noqa: F401
     args_viewable_as_strided_memory,  # noqa: F401
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index fd0e01dedf..fa17624fa5 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -129,24 +129,6 @@ Each subclass exposes attributes unique to its operation type.
    graph.SwitchNode
 
 
-.. module:: cuda.core.managed_memory
-
-Managed memory
---------------
-
-.. autosummary::
-   :toctree: generated/
-
-   Location
-   advise
-   prefetch
-   discard
-   discard_prefetch
-
-.. module:: cuda.core
-   :no-index:
-
-
 Graphics interoperability
 -------------------------
 
@@ -265,7 +247,12 @@ Utility functions
    :toctree: generated/
 
    args_viewable_as_strided_memory
+   advise
+   prefetch
+   discard
+   discard_prefetch
 
    :template: autosummary/cyclass.rst
 
+   Location
    StridedMemoryView
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 25e9066761..17696b616a 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -16,17 +16,17 @@ Highlights
 New features
 ------------
 
-- Added managed-memory range operations under :mod:`cuda.core.managed_memory`:
-  :class:`~managed_memory.Location`, :func:`~managed_memory.advise`,
-  :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and
-  :func:`~managed_memory.discard_prefetch`. Each operation accepts either a
-  single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+
-  the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver
-  entry point, addressing the managed-memory portion of #1333. Locations
-  are expressed via the typed :class:`~managed_memory.Location` dataclass
-  (with classmethod constructors ``device``, ``host``, ``host_numa``, and
-  ``host_numa_current``); ``Device`` and ``int`` values are still accepted
-  for ergonomic compatibility.
+- Added managed-memory range operations to :mod:`cuda.core.utils`:
+  :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`,
+  :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each
+  operation accepts either a single managed :class:`Buffer` or a
+  sequence of them; with cuda.bindings 12.8+, passing more than one
+  buffer dispatches to the corresponding ``cuMem*BatchAsync`` driver
+  entry point, addressing the managed-memory portion of #1333. Locations
+  are expressed via the typed :class:`~utils.Location` dataclass (with
+  classmethod constructors ``device``, ``host``, ``host_numa``, and
+  ``host_numa_current``); ``Device`` and ``int`` values are also
+  accepted for convenience.
 
 
 Fixes and enhancements
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 36fdfd0347..18f7bed114 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -38,7 +38,7 @@
     PinnedMemoryResourceOptions,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
-    managed_memory,
+    utils,
 )
 from cuda.core import (
     system as ccx_system,
@@ -1243,7 +1243,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
     buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
     stream.sync()
     last_location = _get_int_mem_range_attr(
         buffer,
@@ -1251,7 +1251,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
     )
     assert last_location == _HOST_LOCATION_ID
 
-    managed_memory.prefetch(buffer, device, stream=stream)
+    utils.prefetch(buffer, device, stream=stream)
     stream.sync()
     last_location = _get_int_mem_range_attr(
         buffer,
@@ -1263,7 +1263,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
 
 
 def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
-    from cuda.core.managed_memory import Location
+    from cuda.core.utils import Location
 
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
@@ -1271,7 +1271,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
 
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
 
-    managed_memory.advise(buffer, "set_read_mostly")
+    utils.advise(buffer, "set_read_mostly")
     assert (
         _get_int_mem_range_attr(
             buffer,
@@ -1282,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
 
     # cuda.bindings currently exposes the combined location attributes for
     # cuMemRangeGetAttribute, so use the legacy location query here.
-    managed_memory.advise(buffer, "set_preferred_location", Location.host())
+    utils.advise(buffer, "set_preferred_location", Location.host())
     preferred_location = _get_int_mem_range_attr(
         buffer,
         driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
@@ -1300,7 +1300,7 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    managed_memory.prefetch(buffer, device, stream=stream)
+    utils.prefetch(buffer, device, stream=stream)
     stream.sync()
 
     last_location = _get_int_mem_range_attr(
@@ -1322,10 +1322,10 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_
     buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
     stream.sync()
 
-    managed_memory.discard_prefetch(buffer, device, stream=stream)
+    utils.discard_prefetch(buffer, device, stream=stream)
     stream.sync()
 
     last_location = _get_int_mem_range_attr(
@@ -1345,10 +1345,10 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
     stream = device.create_stream()
 
-    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
     stream.sync()
 
-    managed_memory.discard_prefetch(buffer, device, stream=stream)
+    utils.discard_prefetch(buffer, device, stream=stream)
     stream.sync()
 
     last_location = _get_int_mem_range_attr(
@@ -1368,11 +1368,11 @@ def test_managed_memory_operations_reject_non_managed_allocations(init_cuda):
     stream = device.create_stream()
 
     with pytest.raises(ValueError, match="managed-memory allocation"):
-        managed_memory.advise(buffer, "set_read_mostly")
+        utils.advise(buffer, "set_read_mostly")
     with pytest.raises(ValueError, match="managed-memory allocation"):
-        managed_memory.prefetch(buffer, device, stream=stream)
+        utils.prefetch(buffer, device, stream=stream)
     with pytest.raises(ValueError, match="managed-memory allocation"):
-        managed_memory.discard_prefetch(buffer, device, stream=stream)
+        utils.discard_prefetch(buffer, device, stream=stream)
 
     buffer.close()
 
@@ -1387,18 +1387,18 @@ def test_managed_memory_operation_validation(init_cuda):
     stream = device.create_stream()
 
     with pytest.raises(ValueError, match="location is required"):
-        managed_memory.prefetch(buffer, stream=stream)
-    from cuda.core.managed_memory import Location
+        utils.prefetch(buffer, stream=stream)
+    from cuda.core.utils import Location
 
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
-        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL))
+        utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL))
 
     buffer.close()
 
 
 def test_managed_memory_advise_location_validation(init_cuda):
     """Verify doc-specified location constraints for each advice kind."""
-    from cuda.core.managed_memory import Location
+    from cuda.core.utils import Location
 
     device = Device()
     _skip_if_managed_location_ops_unsupported(device)
@@ -1407,25 +1407,25 @@ def test_managed_memory_advise_location_validation(init_cuda):
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
 
     # set_read_mostly works without a location (location is ignored)
-    managed_memory.advise(buffer, "set_read_mostly")
+    utils.advise(buffer, "set_read_mostly")
 
     # set_preferred_location requires a location; device ordinal works
-    managed_memory.advise(buffer, "set_preferred_location", device.device_id)
+    utils.advise(buffer, "set_preferred_location", device.device_id)
 
     # set_preferred_location with host location
-    managed_memory.advise(buffer, "set_preferred_location", Location.host())
+    utils.advise(buffer, "set_preferred_location", Location.host())
 
     # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs)
     with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
-        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0))
+        utils.advise(buffer, "set_accessed_by", Location.host_numa(0))
 
     # set_accessed_by with host_numa_current also raises ValueError
     with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"):
-        managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current())
+        utils.advise(buffer, "set_accessed_by", Location.host_numa_current())
 
     # Inferred location from int: -1 maps to host, 0 maps to device
-    managed_memory.advise(buffer, "set_preferred_location", -1)
-    managed_memory.advise(buffer, "set_preferred_location", 0)
+    utils.advise(buffer, "set_preferred_location", -1)
+    utils.advise(buffer, "set_preferred_location", 0)
 
     buffer.close()
 
@@ -1439,7 +1439,7 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda):
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
 
     advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY
-    managed_memory.advise(buffer, advice_enum)
+    utils.advise(buffer, advice_enum)
 
     assert (
         _get_int_mem_range_attr(
@@ -1461,10 +1461,10 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda):
     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
 
     with pytest.raises(ValueError, match="advice must be one of"):
-        managed_memory.advise(buffer, "not_a_real_advice")
+        utils.advise(buffer, "not_a_real_advice")
 
     with pytest.raises(TypeError, match="advice must be"):
-        managed_memory.advise(buffer, 42)
+        utils.advise(buffer, 42)
 
     buffer.close()
 
@@ -1875,28 +1875,28 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
 
 class TestLocation:
     def test_device_constructor(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.device(0)
         assert loc.kind == "device"
         assert loc.id == 0
 
     def test_host_constructor(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.host()
         assert loc.kind == "host"
         assert loc.id is None
 
     def test_host_numa_constructor(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.host_numa(3)
         assert loc.kind == "host_numa"
         assert loc.id == 3
 
     def test_host_numa_current_constructor(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.host_numa_current()
         assert loc.kind == "host_numa_current"
@@ -1905,20 +1905,20 @@ def test_host_numa_current_constructor(self):
     def test_frozen(self):
         import dataclasses
 
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.device(0)
         with pytest.raises(dataclasses.FrozenInstanceError):
             loc.id = 1
 
     def test_invalid_device_id(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         with pytest.raises(ValueError, match="device id must be >= 0"):
             Location.device(-1)
 
     def test_invalid_kind(self):
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         with pytest.raises(ValueError, match="kind must be one of"):
             Location(kind="not_a_kind", id=None)
@@ -1927,7 +1927,7 @@ def test_invalid_kind(self):
 class TestLocationCoerce:
     def test_passthrough(self):
         from cuda.core._memory._managed_location import _coerce_location
-        from cuda.core.managed_memory import Location
+        from cuda.core.utils import Location
 
         loc = Location.device(0)
         assert _coerce_location(loc) is loc
@@ -1978,7 +1978,7 @@ def test_bad_type(self):
 
 class TestPrefetch:
     def test_single_with_location_host(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -1997,7 +1997,7 @@ def test_single_with_location_host(self, init_cuda):
         buf.close()
 
     def test_batched_same_location(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2020,7 +2020,7 @@ def test_batched_same_location(self, init_cuda):
             buf.close()
 
     def test_batched_per_buffer_location(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2048,7 +2048,7 @@ def test_batched_per_buffer_location(self, init_cuda):
             buf.close()
 
     def test_length_mismatch(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2063,7 +2063,7 @@ def test_length_mismatch(self, init_cuda):
             buf.close()
 
     def test_rejects_non_managed(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         device.set_current()
@@ -2074,7 +2074,7 @@ def test_rejects_non_managed(self, init_cuda):
         buf.close()
 
     def test_location_required(self, init_cuda):
-        from cuda.core.managed_memory import prefetch
+        from cuda.core.utils import prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2087,7 +2087,7 @@ def test_location_required(self, init_cuda):
         buf.close()
 
     def test_options_must_be_none(self, init_cuda):
-        from cuda.core.managed_memory import Location, prefetch
+        from cuda.core.utils import Location, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2102,7 +2102,7 @@ def test_options_must_be_none(self, init_cuda):
 
 class TestDiscard:
     def test_single_buffer(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard, prefetch
+        from cuda.core.utils import Location, discard, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2119,7 +2119,7 @@ def test_single_buffer(self, init_cuda):
         buf.close()
 
     def test_batched(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard, prefetch
+        from cuda.core.utils import Location, discard, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2137,7 +2137,7 @@ def test_batched(self, init_cuda):
             buf.close()
 
     def test_rejects_non_managed(self, init_cuda):
-        from cuda.core.managed_memory import discard
+        from cuda.core.utils import discard
 
         device = Device()
         device.set_current()
@@ -2148,7 +2148,7 @@ def test_rejects_non_managed(self, init_cuda):
         buf.close()
 
     def test_options_must_be_none(self, init_cuda):
-        from cuda.core.managed_memory import discard
+        from cuda.core.utils import discard
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2163,7 +2163,7 @@ def test_options_must_be_none(self, init_cuda):
 
 class TestDiscardPrefetch:
     def test_single_buffer(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+        from cuda.core.utils import Location, discard_prefetch, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2187,7 +2187,7 @@ def test_single_buffer(self, init_cuda):
         buf.close()
 
     def test_batched_same_location(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard_prefetch, prefetch
+        from cuda.core.utils import Location, discard_prefetch, prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2210,7 +2210,7 @@ def test_batched_same_location(self, init_cuda):
             buf.close()
 
     def test_length_mismatch(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard_prefetch
+        from cuda.core.utils import Location, discard_prefetch
 
         device = Device()
         skip_if_managed_memory_unsupported(device)
@@ -2224,7 +2224,7 @@ def test_length_mismatch(self, init_cuda):
             buf.close()
 
     def test_rejects_non_managed(self, init_cuda):
-        from cuda.core.managed_memory import Location, discard_prefetch
+        from cuda.core.utils import Location, discard_prefetch
 
         device = Device()
         device.set_current()
@@ -2237,7 +2237,7 @@ def test_rejects_non_managed(self, init_cuda):
 
 class TestAdvise:
     def test_batched_same_advice(self, init_cuda):
-        from cuda.core.managed_memory import advise
+        from cuda.core.utils import advise
 
         device = Device()
         _skip_if_managed_location_ops_unsupported(device)
@@ -2255,7 +2255,7 @@ def test_batched_same_advice(self, init_cuda):
             buf.close()
 
     def test_batched_per_buffer_location(self, init_cuda):
-        from cuda.core.managed_memory import Location, advise
+        from cuda.core.utils import Location, advise
 
         device = Device()
         _skip_if_managed_location_ops_unsupported(device)
@@ -2270,7 +2270,7 @@ def test_batched_per_buffer_location(self, init_cuda):
             buf.close()
 
     def test_options_must_be_none(self, init_cuda):
-        from cuda.core.managed_memory import advise
+        from cuda.core.utils import advise
 
         device = Device()
         _skip_if_managed_allocation_unsupported(device)