From abdec47f2c3de514a02d14f08fffe3fc097ed729 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:37:49 -0700 Subject: [PATCH 01/31] wip --- cuda_core/cuda/core/_memory/_buffer.pxd | 1 + cuda_core/cuda/core/_memory/_buffer.pyx | 284 ++++++++++++++++++ cuda_core/docs/source/release/0.7.x-notes.rst | 5 + cuda_core/tests/test_memory.py | 127 ++++++++ 4 files changed, 417 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 91c0cfe24a..04b5707e18 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -12,6 +12,7 @@ cdef struct _MemAttrs: int device_id bint is_device_accessible bint is_host_accessible + bint is_managed cdef class Buffer: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 83009f74ae..686585b527 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -72,6 +72,194 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`. 
""" + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", + "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( + "set_accessed_by", + "unset_accessed_by", +)) + + +cdef inline object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + if not hasattr(driver.CUmemLocationType, attr_name): + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." 
+ ) + return getattr(driver.CUmemLocationType, attr_name) + + +cdef inline object _make_managed_location(str location_type, int location_id): + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) + elif location_type == "host_numa_current": + location.id = 0 + else: + location.id = location_id + return location + + +cdef inline tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + if alias.startswith("cu_mem_advise_"): + continue + if advice == getattr(driver.CUmem_advise, attr_name): + return alias, advice + raise ValueError(f"Unsupported advice value: {advice!r}") + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef inline object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + bint allow_host=True, + bint allow_host_numa=True, + bint allow_host_numa_current=True, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = (location).device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of 
{_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + loc_type = "host" + elif loc_id >= 0: + loc_type = "device" + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + if not allow_host: + raise ValueError(f"{what} does not support location_type='host'") + return _make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: + if not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + return _make_managed_location(loc_type, 0) + + if loc_type == "host" and not allow_host: + raise ValueError(f"{what} does not support host locations") + if loc_type == 
"host_numa" and not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if loc_type == "host_numa_current" and not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + return _make_managed_location(loc_type, loc_id) + + +cdef inline void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory buffer") + + +cdef inline void _require_managed_discard_prefetch_support(): + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + raise RuntimeError( + "Buffer.discard_prefetch requires cuda.bindings support for " + "cuMemDiscardAndPrefetchBatchAsync" + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -293,6 +481,99 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) + def advise( + self, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + location_type: str | None = None, + ): + """Apply a managed-memory advice to this buffer. + + This method is only valid for buffers backed by managed memory. + + Parameters + ---------- + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, + ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
+ """ + cdef str advice_name + _require_managed_buffer(self, "Buffer.advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "Buffer.advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + + def prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Prefetch this managed-memory buffer to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.prefetch") + location = _normalize_managed_location( + location, + location_type, + "Buffer.prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + + def discard_prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Discard this managed-memory buffer and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.discard_prefetch") + _require_managed_discard_prefetch_support() + location = _normalize_managed_location( + location, + location_type, + "Buffer.discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [self.handle], + [self._size], + 1, + [location], + [0], + 1, + 0, + s.handle, + ) + ) + def __dlpack__( self, *, @@ -453,6 +734,7 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = False 
out.device_id = -1 + out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -461,10 +743,12 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id + out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id + out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 98551603b6..18b3bede36 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,6 +35,11 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. +- Added managed-memory controls on :class:`Buffer`: ``advise()``, + ``prefetch()``, and ``discard_prefetch()``. These methods validate that the + underlying allocation is managed memory and then forward to the corresponding + CUDA driver operations for range advice and migration. + - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. 
When ``ipc_enabled=True`` and ``numa_id`` is not set, the NUMA node is automatically derived from the diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0473d2d183..dd146785ec 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1134,6 +1134,133 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +def _get_mem_range_attr(buffer, attribute, data_size): + return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + buffer.advise("set_read_mostly") + assert _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) == 1 + + buffer.advise("set_preferred_location", device, location_type="device") + preferred_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, + 4, + ) + preferred_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + 4, + ) + assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert preferred_id == device.device_id + + buffer.prefetch(-1, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + + buffer.discard_prefetch(device, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) + stream.sync() + + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.advise("set_read_mostly") + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.prefetch(device, stream=stream) + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.discard_prefetch(device, stream=stream) + + buffer.close() + + +def test_managed_buffer_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="requires a location"): + 
buffer.prefetch(stream=stream) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + buffer.advise("set_accessed_by", 0, location_type="host_numa") + with pytest.raises(ValueError, match="location must be None or -1"): + buffer.prefetch(0, stream=stream, location_type="host") + + buffer.close() + + def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch From c418050043ef38cc15a74e733d9038d564068c0d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:44:49 -0700 Subject: [PATCH 02/31] wip --- cuda_core/tests/test_memory.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index dd146785ec..44d50e356c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1151,11 +1151,14 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): stream = device.create_stream() buffer.advise("set_read_mostly") - assert _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, - ) == 1 + assert ( + _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) + == 1 + ) buffer.advise("set_preferred_location", device, location_type="device") preferred_type = _get_mem_range_attr( From b879fa5b13922b2a41122f31751cd11c0c1fbaee Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:51:36 -0700 Subject: [PATCH 03/31] fixing ci compiler errors --- cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 686585b527..05a1667b3f 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ 
b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._device import Device @@ -175,7 +175,7 @@ cdef inline object _normalize_managed_location( cdef int loc_id if isinstance(location, Device): - location = (location).device_id + location = location.device_id if location_type is not None and not isinstance(location_type, str): raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") From 04ee3de1859c91158f30a7bffd3246024d422f0e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:07:10 -0700 Subject: [PATCH 04/31] skipping tests that aren't supported --- cuda_core/tests/test_memory.py | 130 ++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 44d50e356c..95c6e6e964 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1138,18 +1138,70 @@ def _get_mem_range_attr(buffer, attribute, data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) -def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory 
access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): pytest.skip("discard-prefetch requires cuda.bindings support") + visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() buffer = mr.allocate(4096) stream = device.create_stream() + buffer.prefetch(-1, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == -1 + + buffer.prefetch(device, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer.advise("set_read_mostly") assert ( _get_mem_range_attr( @@ -1160,70 +1212,60 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): == 1 ) - buffer.advise("set_preferred_location", device, location_type="device") - preferred_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, - 4, - ) - preferred_id = _get_mem_range_attr( + # cuda.bindings 
currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. + buffer.advise("set_preferred_location", location_type="host") + preferred_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, 4, ) - assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert preferred_id == device.device_id + assert preferred_location == -1 - buffer.prefetch(-1, stream=stream) - stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + buffer.close() - buffer.discard_prefetch(device, stream=stream) + +def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() -def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): +def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() - 
skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(4096) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + buffer.prefetch(-1, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + buffer.discard_prefetch(device, stream=stream) + stream.sync() + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() From 9ab3f465d1c7d072a6dd9c6b8b70a9b47a24f3d8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:34:29 -0700 Subject: [PATCH 05/31] cu12 support --- cuda_core/cuda/core/_memory/_buffer.pyx | 40 ++++++++++++++++++-- cuda_core/tests/test_memory.py | 50 ++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 05a1667b3f..4460de900d 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return from cuda.core._device import Device @@ -247,6 +247,20 @@ cdef inline object _normalize_managed_location( return _make_managed_location(loc_type, loc_id) +cdef inline bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches 
these APIs to CUmemLocation-based wrappers. + return get_binding_version() >= (13, 0) + + +cdef inline int _managed_location_to_legacy_device(object location, str what): + cdef object loc_type = location.type + if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"): + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: @@ -518,7 +532,17 @@ cdef class Buffer: allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, allow_host_numa_current=advice_name == "set_preferred_location", ) - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + self.handle, + self._size, + advice, + _managed_location_to_legacy_device(location, "Buffer.advise"), + ) + ) def prefetch( self, @@ -539,7 +563,17 @@ cdef class Buffer: allow_host_numa=True, allow_host_numa_current=True, ) - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + else: + handle_return( + driver.cuMemPrefetchAsync( + self.handle, + self._size, + _managed_location_to_legacy_device(location, "Buffer.prefetch"), + s.handle, + ) + ) def discard_prefetch( self, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 95c6e6e964..380b581e7b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -43,7 +43,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor +from cuda.core._memory import 
IPCBufferDescriptor, _buffer from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1270,6 +1270,54 @@ def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(i buffer.close() +def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + calls = [] + + def fake_cuMemAdvise(ptr, size, advice, location): + calls.append((ptr, size, advice, location)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) + + buffer.advise("set_read_mostly") + + assert len(calls) == 1 + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + + buffer.close() + + +def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + calls = [] + + def fake_cuMemPrefetchAsync(ptr, size, location, hstream): + calls.append((ptr, size, location, hstream)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + + buffer.prefetch(device, stream=stream) + + assert len(calls) == 1 + assert calls[0][2] == device.device_id + assert int(calls[0][3]) == int(stream.handle) + + buffer.close() + + def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): device = Device() device.set_current() From a948066ab2fc6fda3dfb74516538091e96e68746 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:45:51 -0700 Subject: [PATCH 
06/31] Moving to function from Buffer class methods to free standing functions in the cuda.core.managed_memory namespace --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/_memory/_buffer.pyx | 322 +++++++++++------- cuda_core/cuda/core/experimental/__init__.py | 3 +- cuda_core/cuda/core/managed_memory.py | 9 + cuda_core/docs/source/api.rst | 13 + cuda_core/docs/source/release/0.7.x-notes.rst | 10 +- cuda_core/pixi.lock | 18 +- .../test_experimental_backward_compat.py | 7 + cuda_core/tests/test_memory.py | 137 +++++--- 9 files changed, 335 insertions(+), 186 deletions(-) create mode 100644 cuda_core/cuda/core/managed_memory.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 139078e86e..c55c0786ed 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graph import ( diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 8ae6d22ee5..4663302b34 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -113,6 +113,13 @@ cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( "unset_accessed_by", )) +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + cdef inline object _managed_location_enum(str location_type): cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] @@ -130,7 +137,7 @@ cdef inline object _make_managed_location(str location_type, int location_id): if location_type == "host": location.id = 
int(getattr(driver, "CU_DEVICE_CPU", -1)) elif location_type == "host_numa_current": - location.id = 0 + location.id = _HOST_NUMA_CURRENT_ID else: location.id = location_id return location @@ -236,7 +243,7 @@ cdef inline object _normalize_managed_location( raise ValueError( f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" ) - return _make_managed_location(loc_type, 0) + return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) if loc_type == "host" and not allow_host: raise ValueError(f"{what} does not support host locations") @@ -264,16 +271,206 @@ cdef inline int _managed_location_to_legacy_device(object location, str what): cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory buffer") + raise ValueError(f"{what} requires a managed-memory allocation") -cdef inline void _require_managed_discard_prefetch_support(): +cdef inline void _require_managed_discard_prefetch_support(str what): if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): raise RuntimeError( - "Buffer.discard_prefetch requires cuda.bindings support for " - "cuMemDiscardAndPrefetchBatchAsync" + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" ) + +cdef inline tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 
0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef inline tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. 
+ location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + 
location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + batch_ptr = driver.CUdeviceptr(int(ptr)) + _require_managed_discard_prefetch_support("discard_prefetch") + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -502,119 +699,6 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) - def advise( - self, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, - *, - location_type: str | None = None, - ): - """Apply a managed-memory advice to this buffer. - - This method is only valid for buffers backed by managed memory. - - Parameters - ---------- - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, - ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
- """ - cdef str advice_name - _require_managed_buffer(self, "Buffer.advise") - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "Buffer.advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allow_host=True, - allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, - allow_host_numa_current=advice_name == "set_preferred_location", - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - self.handle, - self._size, - advice, - _managed_location_to_legacy_device(location, "Buffer.advise"), - ) - ) - - def prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Prefetch this managed-memory buffer to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.prefetch") - location = _normalize_managed_location( - location, - location_type, - "Buffer.prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) - else: - handle_return( - driver.cuMemPrefetchAsync( - self.handle, - self._size, - _managed_location_to_legacy_device(location, "Buffer.prefetch"), - s.handle, - ) - ) - - def discard_prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Discard this managed-memory buffer and prefetch it to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.discard_prefetch") - _require_managed_discard_prefetch_support() - location = _normalize_managed_location( - location, - location_type, - 
"Buffer.discard_prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [self.handle], - [self._size], - 1, - [location], - [0], - 1, - 0, - s.handle, - ) - ) - def __dlpack__( self, *, diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index e7989f0f26..83fb1c7581 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,9 +38,10 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils # Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py new file mode 100644 index 0000000000..f11aabcd19 --- /dev/null +++ b/cuda_core/cuda/core/managed_memory.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Managed-memory range operations.""" + +from cuda.core._memory._buffer import advise, discard_prefetch, prefetch + +__all__ = ["advise", "prefetch", "discard_prefetch"] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fa7ce48eb5..4d63bbcf88 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -62,6 +62,19 @@ CUDA runtime on other non-blocking streams. +.. module:: cuda.core.managed_memory + +Managed memory +-------------- + +.. 
autosummary:: + :toctree: generated/ + + advise + prefetch + discard_prefetch + + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 18b3bede36..186e3181f1 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,10 +35,12 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. -- Added managed-memory controls on :class:`Buffer`: ``advise()``, - ``prefetch()``, and ``discard_prefetch()``. These methods validate that the - underlying allocation is managed memory and then forward to the corresponding - CUDA driver operations for range advice and migration. +- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: + ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free + functions accept either a managed :class:`Buffer` or a raw pointer plus + ``size=``, validate that the target allocation is managed memory, and then + forward to the corresponding CUDA driver operations for range advice and + migration. - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. 
When ``ipc_enabled=True`` and diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 78da9addb5..e2f8b7b0c2 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -2598,7 +2598,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2625,7 +2625,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2653,7 +2653,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2794,7 +2794,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2817,7 +2817,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2840,7 +2840,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2862,7 +2862,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2884,7 +2884,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2906,7 +2906,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index c3215b056a..82e2cdd5be 100644 --- 
a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,6 +38,7 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -73,6 +74,7 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons + assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -88,6 +90,11 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None + from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch + + assert advise is not None + assert prefetch is not None + assert discard_prefetch is not None # Test 5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 380b581e7b..927014826a 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,6 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, + managed_memory, ) from cuda.core import ( system as ccx_system, @@ -48,6 +49,12 @@ from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 +_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1138,6 +1145,10 @@ def _get_mem_range_attr(buffer, attribute, 
data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + def _skip_if_managed_allocation_unsupported(device): try: if not device.properties.managed_memory: @@ -1165,140 +1176,134 @@ def _skip_if_managed_discard_prefetch_unsupported(device): pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") -def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) - assert last_location == -1 + assert last_location == _HOST_LOCATION_ID - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = 
DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert ( - _get_mem_range_attr( + _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, ) - == 1 + == _READ_MOSTLY_ENABLED ) # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. - buffer.advise("set_preferred_location", location_type="host") - preferred_location = _get_mem_range_attr( + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, - 4, ) - assert preferred_location == -1 + assert preferred_location == _HOST_LOCATION_ID buffer.close() -def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() - buffer = 
DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - buffer.discard_prefetch(device, stream=stream) + managed_memory.discard_prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) calls = [] def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert len(calls) == 1 - assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID)) buffer.close() -def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = 
DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() calls = [] @@ -1306,10 +1311,10 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) assert len(calls) == 1 assert calls[0][2] == device.device_id @@ -1318,38 +1323,66 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): buffer.close() -def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() - buffer = DummyDeviceMemoryResource(device).allocate(4096) + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.advise("set_read_mostly") - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.prefetch(device, stream=stream) - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.discard_prefetch(device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.discard_prefetch(buffer, device, stream=stream) buffer.close() -def test_managed_buffer_operation_validation(init_cuda): +def 
test_managed_memory_operation_validation(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(ValueError, match="requires a location"): - buffer.prefetch(stream=stream) + managed_memory.prefetch(buffer, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - buffer.advise("set_accessed_by", 0, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") with pytest.raises(ValueError, match="location must be None or -1"): - buffer.prefetch(0, stream=stream, location_type="host") + managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") + + buffer.close() + + +def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id buffer.close() From 14575991d65ca85973a4f1dc61f068efc4fc3293 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:46:20 -0700 Subject: [PATCH 07/31] precommit format --- cuda_core/cuda/core/managed_memory.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f11aabcd19..f5bb09c13d 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -6,4 +6,4 @@ from cuda.core._memory._buffer import advise, discard_prefetch, prefetch -__all__ = ["advise", "prefetch", "discard_prefetch"] +__all__ = ["advise", "discard_prefetch", "prefetch"] From acb402478cac58689f069e0836819b2e91010c09 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 17:30:41 -0700 Subject: [PATCH 08/31] iterating on implementation --- cuda_bindings/pixi.lock | 86 ++++++++++++------------- cuda_core/cuda/core/_memory/_buffer.pyx | 63 ++++++++++++++---- cuda_core/tests/test_memory.py | 85 ++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 56 deletions(-) diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index b01d6eec69..237a169580 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -1081,21 +1081,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.2.20-h7938cbb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-12.9.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-12.9.86-h69a702a_6.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.3-py314h1807b08_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda @@ -1134,7 +1134,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -1160,8 +1160,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-12.9.82-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.9.86-hecca717_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2025.2.0-hb617929_1.conda @@ -1264,7 +1264,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - conda: . - build: py314hb727236_0 + build: py314ha6d028f_0 - conda: ../cuda_pathfinder linux-aarch64: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2 @@ -1460,21 +1460,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/conda-gcc-specs-15.2.0-hd546029_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-12.9.86-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-12.9.86-h719f0c7_6.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-13.2.27-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.2.51-h719f0c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.2.51-h2466b09_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.2.20-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.3-py314h344ed54_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda @@ -1520,8 +1520,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-h2466b09_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-12.9.82-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-12.9.86-hac47afa_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.2.51-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6-h6a83c73_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.53-h7351971_0.conda @@ -1583,7 +1583,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda - conda: . 
- build: py314h5e6f764_0 + build: py314h356c398_0 - conda: ../cuda_pathfinder packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -2154,7 +2154,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2182,7 +2182,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2209,7 +2209,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2237,7 +2237,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2265,7 +2265,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2293,7 +2293,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 4663302b34..829e05b3ad 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -205,9 +205,11 @@ cdef inline object _normalize_managed_location( ) loc_id = location if loc_id == -1: - loc_type = "host" + if not allow_host: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) elif loc_id >= 0: - loc_type = "device" + return _make_managed_location("device", loc_id) else: raise ValueError( f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" @@ -245,23 +247,22 @@ cdef inline object 
_normalize_managed_location( ) return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - if loc_type == "host" and not allow_host: - raise ValueError(f"{what} does not support host locations") - if loc_type == "host_numa" and not allow_host_numa: - raise ValueError(f"{what} does not support location_type='host_numa'") - if loc_type == "host_numa_current" and not allow_host_numa_current: - raise ValueError(f"{what} does not support location_type='host_numa_current'") - return _make_managed_location(loc_type, loc_id) - cdef inline bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. return get_binding_version() >= (13, 0) +cdef object _LEGACY_LOC_DEVICE = None +cdef object _LEGACY_LOC_HOST = None + cdef inline int _managed_location_to_legacy_device(object location, str what): + global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST + if _LEGACY_LOC_DEVICE is None: + _LEGACY_LOC_DEVICE = _managed_location_enum("device") + _LEGACY_LOC_HOST = _managed_location_enum("host") cdef object loc_type = location.type - if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"): + if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: return location.id raise RuntimeError( f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" @@ -396,7 +397,25 @@ def prefetch( int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): - """Prefetch a managed-memory allocation range to a target location.""" + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. 
+ A location is required for prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ cdef Stream s = Stream_accept(stream) cdef object ptr cdef size_t nbytes @@ -440,7 +459,25 @@ def discard_prefetch( int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): - """Discard a managed-memory allocation range and prefetch it to a target location.""" + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. 
+ """ cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 927014826a..ea827818ac 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1359,6 +1359,91 @@ def test_managed_memory_operation_validation(init_cuda): buffer.close() +def test_managed_memory_advise_location_validation(init_cuda): + """Verify doc-specified location constraints for each advice kind.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + managed_memory.advise(buffer, "set_read_mostly") + + # set_preferred_location requires a location; device ordinal works + managed_memory.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location_type + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + + # Inferred location from int: -1 maps to host, 0 maps to device + managed_memory.advise(buffer, "set_preferred_location", -1) + managed_memory.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + 
_skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + managed_memory.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): + """advise() raises TypeError when size= is given with a Buffer target.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(TypeError, match="does not accept size="): + managed_memory.advise(buffer, "set_read_mostly", size=1024) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + managed_memory.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + managed_memory.advise(buffer, 42) + + buffer.close() + + def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) From d10ab07e2f402628b83b08e07d95da39c4f2b634 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 18:13:36 -0700 Subject: [PATCH 09/31] Simplify managed-memory helpers: remove long-form aliases, cache lookups, fix docs - Remove duplicate long-form "cu_mem_advise_*" string aliases from _MANAGED_ADVICE_ALIASES; users pass short strings or the enum directly - Replace 4 
boolean allow_* params in _normalize_managed_location with a single allowed_loctypes frozenset driven by _MANAGED_ADVICE_ALLOWED_LOCTYPES - Cache immutable runtime checks: CU_DEVICE_CPU, v2 bindings flag, discard_prefetch support, and advice enum-to-alias reverse map - Collapse hasattr+getattr to single getattr in _managed_location_enum - Move _require_managed_discard_prefetch_support to top of discard_prefetch for fail-fast behavior - Fix docs build: reset Sphinx module scope after managed_memory section in api.rst so subsequent sections resolve under cuda.core - Add discard_prefetch pool-allocation test and comment on _get_mem_range_attr Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 94 ++++++++++++++----------- cuda_core/docs/source/api.rst | 2 + cuda_core/tests/test_memory.py | 26 +++++++ 3 files changed, 79 insertions(+), 43 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 829e05b3ad..d280b4ea2b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -89,17 +89,11 @@ cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { cdef dict _MANAGED_ADVICE_ALIASES = { "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", - "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", - "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", - "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", - "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", - "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", - "cu_mem_advise_unset_accessed_by": 
"CU_MEM_ADVISE_UNSET_ACCESSED_BY", } cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( @@ -108,10 +102,18 @@ cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( "unset_preferred_location", )) -cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( - "set_accessed_by", - "unset_accessed_by", -)) +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + "set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 cdef int _HOST_NUMA_CURRENT_ID = 0 @@ -120,22 +122,32 @@ cdef size_t _SINGLE_RANGE_COUNT = 1 cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 +# Lazily cached values for immutable runtime properties. +cdef object _CU_DEVICE_CPU = None +cdef dict _ADVICE_ENUM_TO_ALIAS = None +cdef int _V2_BINDINGS = -1 +cdef int _DISCARD_PREFETCH_SUPPORTED = -1 + cdef inline object _managed_location_enum(str location_type): cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - if not hasattr(driver.CUmemLocationType, attr_name): + cdef object result = getattr(driver.CUmemLocationType, attr_name, None) + if result is None: raise RuntimeError( f"Managed-memory location type {location_type!r} is not supported by the " f"installed cuda.bindings package." 
) - return getattr(driver.CUmemLocationType, attr_name) + return result cdef inline object _make_managed_location(str location_type, int location_id): + global _CU_DEVICE_CPU cdef object location = driver.CUmemLocation() location.type = _managed_location_enum(location_type) if location_type == "host": - location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) + if _CU_DEVICE_CPU is None: + _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) + location.id = _CU_DEVICE_CPU elif location_type == "host_numa_current": location.id = _HOST_NUMA_CURRENT_ID else: @@ -157,12 +169,17 @@ cdef inline tuple _normalize_managed_advice(object advice): return alias, getattr(driver.CUmem_advise, attr_name) if isinstance(advice, driver.CUmem_advise): - for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): - if alias.startswith("cu_mem_advise_"): - continue - if advice == getattr(driver.CUmem_advise, attr_name): - return alias, advice - raise ValueError(f"Unsupported advice value: {advice!r}") + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice raise TypeError( "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" @@ -174,9 +191,7 @@ cdef inline object _normalize_managed_location( object location_type, str what, bint allow_none=False, - bint allow_host=True, - bint allow_host_numa=True, - bint allow_host_numa_current=True, + frozenset allowed_loctypes=_ALL_LOCATION_TYPES, ): cdef object loc_type cdef int loc_id @@ -194,6 +209,9 @@ cdef inline object _normalize_managed_location( f"or None, got {location_type!r}" ) + if loc_type is not None and loc_type not in 
allowed_loctypes: + raise ValueError(f"{what} does not support location_type='{loc_type}'") + if loc_type is None: if location is None: if allow_none: @@ -205,7 +223,7 @@ cdef inline object _normalize_managed_location( ) loc_id = location if loc_id == -1: - if not allow_host: + if "host" not in allowed_loctypes: raise ValueError(f"{what} does not support host locations") return _make_managed_location("host", -1) elif loc_id >= 0: @@ -227,20 +245,14 @@ cdef inline object _normalize_managed_location( raise ValueError( f"{what} location must be None or -1 when location_type is 'host', got {location!r}" ) - if not allow_host: - raise ValueError(f"{what} does not support location_type='host'") return _make_managed_location(loc_type, -1) elif loc_type == "host_numa": - if not allow_host_numa: - raise ValueError(f"{what} does not support location_type='host_numa'") if not isinstance(location, int) or location < 0: raise ValueError( f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" ) return _make_managed_location(loc_type, location) else: - if not allow_host_numa_current: - raise ValueError(f"{what} does not support location_type='host_numa_current'") if location is not None: raise ValueError( f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" @@ -250,7 +262,10 @@ cdef inline object _normalize_managed_location( cdef inline bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. 
- return get_binding_version() >= (13, 0) + global _V2_BINDINGS + if _V2_BINDINGS < 0: + _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + return _V2_BINDINGS != 0 cdef object _LEGACY_LOC_DEVICE = None @@ -276,7 +291,10 @@ cdef inline void _require_managed_buffer(Buffer self, str what): cdef inline void _require_managed_discard_prefetch_support(str what): - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + global _DISCARD_PREFETCH_SUPPORTED + if _DISCARD_PREFETCH_SUPPORTED < 0: + _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 + if not _DISCARD_PREFETCH_SUPPORTED: raise RuntimeError( f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" ) @@ -372,9 +390,7 @@ def advise( location_type, "advise", allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allow_host=True, - allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, - allow_host_numa_current=advice_name == "set_preferred_location", + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], ) if _managed_location_uses_v2_bindings(): handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) @@ -425,10 +441,6 @@ def prefetch( location, location_type, "prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, ) if _managed_location_uses_v2_bindings(): handle_return( @@ -478,6 +490,7 @@ def discard_prefetch( Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
""" + _require_managed_discard_prefetch_support("discard_prefetch") cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr @@ -485,15 +498,10 @@ def discard_prefetch( ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") batch_ptr = driver.CUdeviceptr(int(ptr)) - _require_managed_discard_prefetch_support("discard_prefetch") location = _normalize_managed_location( location, location_type, "discard_prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, ) handle_return( driver.cuMemDiscardAndPrefetchBatchAsync( diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 4d63bbcf88..7bf59ae495 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -74,6 +74,8 @@ Managed memory prefetch discard_prefetch +.. module:: cuda.core + :no-index: CUDA compilation toolchain -------------------------- diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index ea827818ac..5296ea344a 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1142,6 +1142,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): def _get_mem_range_attr(buffer, attribute, data_size): + # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. 
return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) @@ -1252,6 +1253,31 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda buffer.close() +def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + managed_memory.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_discard_prefetch_unsupported(device) From c250c92e47393fa6cb0e6611245c5a4dd0c3b6cf Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 09:21:11 -0700 Subject: [PATCH 10/31] fix(test): reset _V2_BINDINGS cache so legacy-signature tests take the legacy path The _V2_BINDINGS cache in _buffer.pyx persists across tests, so monkeypatching get_binding_version alone is insufficient when earlier tests have already populated the cache with the v2 value. Promote _V2_BINDINGS from cdef int to a Python-level variable so tests can monkeypatch it directly via monkeypatch.setattr, and reset it to -1 in both legacy-signature tests. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- cuda_core/tests/test_memory.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 6f5809e06c..d109de2ac4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -124,7 +124,7 @@ cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 # Lazily cached values for immutable runtime properties. cdef object _CU_DEVICE_CPU = None cdef dict _ADVICE_ENUM_TO_ALIAS = None -cdef int _V2_BINDINGS = -1 +_V2_BINDINGS = -1 cdef int _DISCARD_PREFETCH_SUPPORTED = -1 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9cd3209d8d..411a3c6cb5 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1314,6 +1314,7 @@ def fake_cuMemAdvise(ptr, size, advice, location): return (driver.CUresult.CUDA_SUCCESS,) monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) managed_memory.advise(buffer, "set_read_mostly") @@ -1338,6 +1339,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): return (driver.CUresult.CUDA_SUCCESS,) monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) managed_memory.prefetch(buffer, device, stream=stream) From 89329d9c6eff581445b4806fe0217e598a2313fa Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 10:18:41 -0700 Subject: [PATCH 11/31] fix(test): require concurrent_managed_access for advise tests that hit real hardware These three tests call cuMemAdvise on real CUDA devices and verify memory range attributes. 
On devices without concurrent_managed_access (e.g. Windows/WDDM), set_read_mostly silently no-ops and set_preferred_location fails with CUDA_ERROR_INVALID_DEVICE. Use the stricter _skip_if_managed_location_ops_unsupported guard, matching the pattern already used by test_managed_memory_functions_accept_raw_pointer_ranges. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/tests/test_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 411a3c6cb5..56c505fbe6 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1207,7 +1207,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -1390,7 +1390,7 @@ def test_managed_memory_operation_validation(init_cuda): def test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -1422,7 +1422,7 @@ def test_managed_memory_advise_location_validation(init_cuda): def test_managed_memory_advise_accepts_enum_value(init_cuda): """advise() accepts CUmem_advise enum values directly, not just string aliases.""" device = Device() - _skip_if_managed_allocation_unsupported(device) + _skip_if_managed_location_ops_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) From 8a75d1bf1f1172e4681bb232a22f00ff9567d5d8 Mon 
Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 11:23:53 -0700 Subject: [PATCH 12/31] fix: validate managed buffer before checking discard_prefetch bindings support Reorder checks in discard_prefetch so _normalize_managed_target_range runs before _require_managed_discard_prefetch_support. This ensures non-managed buffers raise ValueError before the RuntimeError for missing cuMemDiscardAndPrefetchBatchAsync support. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index d109de2ac4..ffd82facb5 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -489,13 +489,13 @@ def discard_prefetch( Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ - _require_managed_discard_prefetch_support("discard_prefetch") - cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr cdef size_t nbytes ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) batch_ptr = driver.CUdeviceptr(int(ptr)) location = _normalize_managed_location( location, From 9e9b1e0914d30f855389a349cf8d41d134b1c4dc Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 14:08:24 -0700 Subject: [PATCH 13/31] refactor: extract managed memory ops into dedicated _managed_memory_ops module Move advise, prefetch, and discard_prefetch functions and their helpers out of _buffer.pyx into a new _managed_memory_ops Cython module to improve separation of concerns. Expose _init_mem_attrs and _query_memory_attrs as non-inline cdef functions in _buffer.pxd so the new module can reuse them. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pxd | 8 + cuda_core/cuda/core/_memory/_buffer.pyx | 449 +---------------- .../cuda/core/_memory/_managed_memory_ops.pxd | 6 + .../cuda/core/_memory/_managed_memory_ops.pyx | 458 ++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 2 +- cuda_core/tests/test_memory.py | 14 +- 6 files changed, 483 insertions(+), 454 deletions(-) create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 04b5707e18..9065da77eb 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle( MemoryResource mr, object ipc_descriptor = * ) + +# Memory attribute query helpers (used by _managed_memory_ops) +cdef void _init_mem_attrs(Buffer self) +cdef int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr, +) except -1 nogil diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index ffd82facb5..104252a62b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -35,7 +35,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._device import Device @@ -72,449 +72,6 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting """ -cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( - "device", - "host", - "host_numa", - "host_numa_current", -) - 
-cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { - "device": "CU_MEM_LOCATION_TYPE_DEVICE", - "host": "CU_MEM_LOCATION_TYPE_HOST", - "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", - "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", -} - -cdef dict _MANAGED_ADVICE_ALIASES = { - "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", - "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", - "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", - "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", - "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", - "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", -} - -cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( - "set_read_mostly", - "unset_read_mostly", - "unset_preferred_location", -)) - -cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) -cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) -cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) - -cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { - "set_read_mostly": _DEVICE_HOST_NUMA, - "unset_read_mostly": _DEVICE_HOST_NUMA, - "set_preferred_location": _ALL_LOCATION_TYPES, - "unset_preferred_location": _DEVICE_HOST_NUMA, - "set_accessed_by": _DEVICE_HOST_ONLY, - "unset_accessed_by": _DEVICE_HOST_ONLY, -} - -cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 -cdef int _HOST_NUMA_CURRENT_ID = 0 -cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 -cdef size_t _SINGLE_RANGE_COUNT = 1 -cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 -cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 - -# Lazily cached values for immutable runtime properties. 
-cdef object _CU_DEVICE_CPU = None -cdef dict _ADVICE_ENUM_TO_ALIAS = None -_V2_BINDINGS = -1 -cdef int _DISCARD_PREFETCH_SUPPORTED = -1 - - -cdef inline object _managed_location_enum(str location_type): - cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - cdef object result = getattr(driver.CUmemLocationType, attr_name, None) - if result is None: - raise RuntimeError( - f"Managed-memory location type {location_type!r} is not supported by the " - f"installed cuda.bindings package." - ) - return result - - -cdef inline object _make_managed_location(str location_type, int location_id): - global _CU_DEVICE_CPU - cdef object location = driver.CUmemLocation() - location.type = _managed_location_enum(location_type) - if location_type == "host": - if _CU_DEVICE_CPU is None: - _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) - location.id = _CU_DEVICE_CPU - elif location_type == "host_numa_current": - location.id = _HOST_NUMA_CURRENT_ID - else: - location.id = location_id - return location - - -cdef inline tuple _normalize_managed_advice(object advice): - cdef str alias - cdef str attr_name - if isinstance(advice, str): - alias = advice.lower() - attr_name = _MANAGED_ADVICE_ALIASES.get(alias) - if attr_name is None: - raise ValueError( - "advice must be one of " - f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" - ) - return alias, getattr(driver.CUmem_advise, attr_name) - - if isinstance(advice, driver.CUmem_advise): - global _ADVICE_ENUM_TO_ALIAS - if _ADVICE_ENUM_TO_ALIAS is None: - _ADVICE_ENUM_TO_ALIAS = {} - for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): - enum_val = getattr(driver.CUmem_advise, attr_name, None) - if enum_val is not None: - _ADVICE_ENUM_TO_ALIAS[enum_val] = alias - alias = _ADVICE_ENUM_TO_ALIAS.get(advice) - if alias is None: - raise ValueError(f"Unsupported advice value: {advice!r}") - return alias, advice - - raise TypeError( - "advice must be a cuda.bindings.driver.CUmem_advise value or a supported 
string alias" - ) - - -cdef inline object _normalize_managed_location( - object location, - object location_type, - str what, - bint allow_none=False, - frozenset allowed_loctypes=_ALL_LOCATION_TYPES, -): - cdef object loc_type - cdef int loc_id - - if isinstance(location, Device): - location = location.device_id - - if location_type is not None and not isinstance(location_type, str): - raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") - - loc_type = None if location_type is None else (location_type).lower() - if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: - raise ValueError( - f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " - f"or None, got {location_type!r}" - ) - - if loc_type is not None and loc_type not in allowed_loctypes: - raise ValueError(f"{what} does not support location_type='{loc_type}'") - - if loc_type is None: - if location is None: - if allow_none: - return _make_managed_location("host", -1) - raise ValueError(f"{what} requires a location") - if not isinstance(location, int): - raise TypeError( - f"{what} location must be a Device, int, or None, got {type(location).__name__}" - ) - loc_id = location - if loc_id == -1: - if "host" not in allowed_loctypes: - raise ValueError(f"{what} does not support host locations") - return _make_managed_location("host", -1) - elif loc_id >= 0: - return _make_managed_location("device", loc_id) - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" - ) - elif loc_type == "device": - if isinstance(location, int) and location >= 0: - loc_id = location - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" - ) - return _make_managed_location(loc_type, loc_id) - elif loc_type == "host": - if location not in (None, -1): - raise ValueError( - f"{what} location must be None or -1 
when location_type is 'host', got {location!r}" - ) - return _make_managed_location(loc_type, -1) - elif loc_type == "host_numa": - if not isinstance(location, int) or location < 0: - raise ValueError( - f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" - ) - return _make_managed_location(loc_type, location) - else: - if location is not None: - raise ValueError( - f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" - ) - return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - - -cdef inline bint _managed_location_uses_v2_bindings(): - # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. - global _V2_BINDINGS - if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 - return _V2_BINDINGS != 0 - - -cdef object _LEGACY_LOC_DEVICE = None -cdef object _LEGACY_LOC_HOST = None - -cdef inline int _managed_location_to_legacy_device(object location, str what): - global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST - if _LEGACY_LOC_DEVICE is None: - _LEGACY_LOC_DEVICE = _managed_location_enum("device") - _LEGACY_LOC_HOST = _managed_location_enum("host") - cdef object loc_type = location.type - if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: - return location.id - raise RuntimeError( - f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" - ) - - -cdef inline void _require_managed_buffer(Buffer self, str what): - _init_mem_attrs(self) - if not self._mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - - -cdef inline void _require_managed_discard_prefetch_support(str what): - global _DISCARD_PREFETCH_SUPPORTED - if _DISCARD_PREFETCH_SUPPORTED < 0: - _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 - if not _DISCARD_PREFETCH_SUPPORTED: - raise RuntimeError( - f"{what} requires cuda.bindings support for 
cuMemDiscardAndPrefetchBatchAsync" - ) - - -cdef inline tuple _managed_range_from_buffer( - Buffer buffer, - int size, - str what, -): - if size != _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} does not accept size= when target is a Buffer") - _require_managed_buffer(buffer, what) - return buffer.handle, buffer._size - - -cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0: - cdef object ptr_obj - try: - ptr_obj = int(target) - except Exception as exc: - raise TypeError( - f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" - ) from exc - if ptr_obj < 0: - raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") - return ptr_obj - - -cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1: - cdef _MemAttrs mem_attrs - with nogil: - _query_memory_attrs(mem_attrs, ptr) - if not mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - return 0 - - -cdef inline tuple _normalize_managed_target_range( - object target, - int size, - str what, -): - cdef uintptr_t ptr - - if isinstance(target, Buffer): - return _managed_range_from_buffer(target, size, what) - - if size == _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} requires size= when target is a raw pointer") - ptr = _coerce_raw_pointer(target, what) - _require_managed_pointer(ptr, what) - return ptr, size - - -def advise( - target, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, - *, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Apply managed-memory advice to an allocation range. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. 
String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef str advice_name - cdef object ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "advise") - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - ptr, - nbytes, - advice, - _managed_location_to_legacy_device(location, "advise"), - ) - ) - - -def prefetch( - target, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Prefetch a managed-memory allocation range to a target location. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for prefetch. 
- stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous prefetch. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef Stream s = Stream_accept(stream) - cdef object ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") - location = _normalize_managed_location( - location, - location_type, - "prefetch", - ) - if _managed_location_uses_v2_bindings(): - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - location, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) - else: - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - _managed_location_to_legacy_device(location, "prefetch"), - s.handle, - ) - ) - - -def discard_prefetch( - target, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, - location_type: str | None = None, -): - """Discard a managed-memory allocation range and prefetch it to a target location. - - Parameters - ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for discard_prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous operation. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. - location_type : str | None, optional - Explicit location kind. 
Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef object ptr - cdef object batch_ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") - _require_managed_discard_prefetch_support("discard_prefetch") - cdef Stream s = Stream_accept(stream) - batch_ptr = driver.CUdeviceptr(int(ptr)) - location = _normalize_managed_location( - location, - location_type, - "discard_prefetch", - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [batch_ptr], - [nbytes], - _SINGLE_RANGE_COUNT, - [location], - [_FIRST_PREFETCH_LOCATION_INDEX], - _SINGLE_PREFETCH_LOCATION_COUNT, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) - cdef class Buffer: """Represent a handle to allocated memory. @@ -864,14 +421,14 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ -cdef inline void _init_mem_attrs(Buffer self): +cdef void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) self._mem_attrs_inited = True -cdef inline int _query_memory_attrs( +cdef int _query_memory_attrs( _MemAttrs& out, cydriver.CUdeviceptr ptr ) except -1 nogil: diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd new file mode 100644 index 0000000000..a7019c784d --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# Managed-memory operation helpers (advise, prefetch, discard_prefetch). +# The public API is exposed via def functions; no cdef declarations needed. 
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx new file mode 100644 index 0000000000..649c2cbe72 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -0,0 +1,458 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._stream cimport Stream, Stream_accept + +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._device import Device + + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + 
"set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} + +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + +# Lazily cached values for immutable runtime properties. +cdef object _CU_DEVICE_CPU = None +cdef dict _ADVICE_ENUM_TO_ALIAS = None +_V2_BINDINGS = -1 +cdef int _DISCARD_PREFETCH_SUPPORTED = -1 + + +cdef object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + cdef object result = getattr(driver.CUmemLocationType, attr_name, None) + if result is None: + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." 
+ ) + return result + + +cdef object _make_managed_location(str location_type, int location_id): + global _CU_DEVICE_CPU + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + if _CU_DEVICE_CPU is None: + _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) + location.id = _CU_DEVICE_CPU + elif location_type == "host_numa_current": + location.id = _HOST_NUMA_CURRENT_ID + else: + location.id = location_id + return location + + +cdef tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + frozenset allowed_loctypes=_ALL_LOCATION_TYPES, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = location.device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if 
location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is not None and loc_type not in allowed_loctypes: + raise ValueError(f"{what} does not support location_type='{loc_type}'") + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + if "host" not in allowed_loctypes: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) + elif loc_id >= 0: + return _make_managed_location("device", loc_id) + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + return _make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + 
return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) + + +cdef bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. + global _V2_BINDINGS + if _V2_BINDINGS < 0: + _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + return _V2_BINDINGS != 0 + + +cdef object _LEGACY_LOC_DEVICE = None +cdef object _LEGACY_LOC_HOST = None + +cdef int _managed_location_to_legacy_device(object location, str what): + global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST + if _LEGACY_LOC_DEVICE is None: + _LEGACY_LOC_DEVICE = _managed_location_enum("device") + _LEGACY_LOC_HOST = _managed_location_enum("host") + cdef object loc_type = location.type + if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + +cdef void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + + +cdef void _require_managed_discard_prefetch_support(str what): + global _DISCARD_PREFETCH_SUPPORTED + if _DISCARD_PREFETCH_SUPPORTED < 0: + _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 + if not _DISCARD_PREFETCH_SUPPORTED: + raise RuntimeError( + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" + ) + + +cdef tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 
0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. 
+ location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. 
+ """ + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. 
+ """ + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) + batch_ptr = driver.CUdeviceptr(int(ptr)) + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f5bb09c13d..005c9ec3cf 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -4,6 +4,6 @@ """Managed-memory range operations.""" -from cuda.core._memory._buffer import advise, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch __all__ = ["advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 56c505fbe6..544b7afc03 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -44,7 +44,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor, _buffer +from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1313,9 +1313,9 @@ def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemAdvise", 
fake_cuMemAdvise) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) managed_memory.advise(buffer, "set_read_mostly") @@ -1338,9 +1338,9 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) managed_memory.prefetch(buffer, device, stream=stream) From 90f07117615a25b45baf9722c3c1f0835c85d1c5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 14:16:38 -0700 Subject: [PATCH 14/31] pre-commit fix --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 104252a62b..e47f3f4926 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -35,7 +35,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device From b4d252cdb5a8899d775db185d0cc9ec92c9cd474 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 11:07:46 -0700 Subject: [PATCH 15/31] Removing blank file --- cuda_core/cuda/core/_memory/_managed_memory_ops.pxd | 6 ------ 1 
file changed, 6 deletions(-) delete mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd deleted file mode 100644 index a7019c784d..0000000000 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -# Managed-memory operation helpers (advise, prefetch, discard_prefetch). -# The public API is exposed via def functions; no cdef declarations needed. From faaa1d881363eb4ea5d3d13cf0a21b433cdcd61f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 13:15:08 -0700 Subject: [PATCH 16/31] wip --- .../cuda/core/_memory/_managed_memory_ops.pyx | 117 +++++------------- cuda_core/tests/test_memory.py | 42 ------- 2 files changed, 29 insertions(+), 130 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 649c2cbe72..04dc33ed75 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,10 +4,7 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t - -from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return @@ -56,7 +53,6 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 cdef int _HOST_NUMA_CURRENT_ID = 0 cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 cdef size_t _SINGLE_RANGE_COUNT = 1 @@ -241,71 +237,19 @@ cdef void 
_require_managed_discard_prefetch_support(str what): ) -cdef tuple _managed_range_from_buffer( - Buffer buffer, - int size, - str what, -): - if size != _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} does not accept size= when target is a Buffer") - _require_managed_buffer(buffer, what) - return buffer.handle, buffer._size - - -cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 0: - cdef object ptr_obj - try: - ptr_obj = int(target) - except Exception as exc: - raise TypeError( - f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" - ) from exc - if ptr_obj < 0: - raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") - return ptr_obj - - -cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: - cdef _MemAttrs mem_attrs - with nogil: - _query_memory_attrs(mem_attrs, ptr) - if not mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - return 0 - - -cdef tuple _normalize_managed_target_range( - object target, - int size, - str what, -): - cdef uintptr_t ptr - - if isinstance(target, Buffer): - return _managed_range_from_buffer(target, size, what) - - if size == _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} requires size= when target is a raw pointer") - ptr = _coerce_raw_pointer(target, what) - _require_managed_pointer(ptr, what) - return ptr, size - - def advise( - target, + target: Buffer, advice: driver.CUmem_advise | str, location: Device | int | None = None, *, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Apply managed-memory advice to an allocation range. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. advice : :obj:`~driver.CUmem_advise` | str Managed-memory advice to apply. 
String aliases such as ``"set_read_mostly"``, ``"set_preferred_location"``, and @@ -314,17 +258,18 @@ def advise( Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None`` for advice values that ignore location. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "advise") cdef str advice_name - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "advise") advice_name, advice = _normalize_managed_advice(advice) location = _normalize_managed_location( location, @@ -347,37 +292,36 @@ def advise( def prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Prefetch a managed-memory allocation range to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous prefetch. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. 
location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") location = _normalize_managed_location( location, location_type, @@ -405,40 +349,37 @@ def prefetch( def discard_prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Discard a managed-memory allocation range and prefetch it to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for discard_prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous operation. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
""" - cdef object ptr - cdef object batch_ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + if not isinstance(target, Buffer): + raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "discard_prefetch") _require_managed_discard_prefetch_support("discard_prefetch") cdef Stream s = Stream_accept(stream) - batch_ptr = driver.CUdeviceptr(int(ptr)) + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size + cdef object batch_ptr = driver.CUdeviceptr(int(ptr)) location = _normalize_managed_location( location, location_type, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 544b7afc03..dbb5ac6d8c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1441,20 +1441,6 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda): buffer.close() -def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): - """advise() raises TypeError when size= is given with a Buffer target.""" - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - with pytest.raises(TypeError, match="does not accept size="): - managed_memory.advise(buffer, "set_read_mostly", size=1024) - - buffer.close() - - def test_managed_memory_advise_invalid_advice_values(init_cuda): """advise() rejects invalid advice strings and wrong types.""" device = Device() @@ -1472,34 +1458,6 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer.close() -def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - 
managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) - stream.sync() - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch From cf2f20d1be323b8cd31f76125dffad959cf0b947 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:46:30 -0700 Subject: [PATCH 17/31] fix(cuda.core): update binding_version import after upstream merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream renamed get_binding_version → binding_version and moved it from cuda.core._utils.cuda_utils to cuda.core._utils.version. Update the managed-memory ops module to match. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 04dc33ed75..81ff5582a6 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -7,7 +7,8 @@ from __future__ import annotations from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs from cuda.core._stream cimport Stream, Stream_accept -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.version import binding_version from cuda.core._device import Device @@ -201,7 +202,7 @@ cdef bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. global _V2_BINDINGS if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0 return _V2_BINDINGS != 0 From db3bac2e042ff07b6ab37f510f2fe06bc1cbc598 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:46:36 -0700 Subject: [PATCH 18/31] revert: drop managed_memory shim in cuda.core.experimental The cuda.core.experimental namespace is being deprecated and should not gain new submodules. Per review feedback, the managed_memory module should only be reachable via cuda.core.managed_memory, not via the experimental compatibility shim. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/experimental/__init__.py | 3 +-- cuda_core/tests/test_experimental_backward_compat.py | 7 ------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 34b442173b..f65e7852a9 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,10 +38,9 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import managed_memory, system, utils +from cuda.core import system, utils # Make utils accessible as a submodule for backward compatibility -__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index 82e2cdd5be..c3215b056a 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,7 +38,6 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") - assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -74,7 +73,6 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons - assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -90,11 +88,6 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None - from cuda.core.experimental.managed_memory import advise, 
discard_prefetch, prefetch - - assert advise is not None - assert prefetch is not None - assert discard_prefetch is not None # Test 5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") From 20d036ebe1ae148222b4ad9e0fdca20502ed24de Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:59:58 -0700 Subject: [PATCH 19/31] feat(cuda.core): add Location dataclass for managed memory Frozen dataclass with classmethod constructors for the four CUmemLocationType kinds (device, host, host_numa, host_numa_current). Validates id constraints in __post_init__. Re-exported from cuda.core.managed_memory. This will replace the location=/location_type= kwargs in the upcoming unified 1..N managed-memory ops API. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 51 +++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 3 +- cuda_core/tests/test_memory.py | 43 ++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 cuda_core/cuda/core/_memory/_managed_location.py diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py new file mode 100644 index 0000000000..7e2515f573 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current") +LocationKind = Literal["device", "host", "host_numa", "host_numa_current"] + + +@dataclass(frozen=True) +class Location: + """Typed managed-memory location. + + Use the classmethod constructors (``device``, ``host``, ``host_numa``, + ``host_numa_current``) rather than constructing directly. 
+ """ + + kind: LocationKind + id: int | None = None + + def __post_init__(self) -> None: + if self.kind not in _VALID_KINDS: + raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}") + if self.kind == "device": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("device id must be >= 0") + elif self.kind == "host_numa": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("host_numa id must be >= 0") + elif self.kind in ("host", "host_numa_current"): + if self.id is not None: + raise ValueError(f"{self.kind} location must have id=None") + + @classmethod + def device(cls, device_id: int) -> "Location": + return cls(kind="device", id=device_id) + + @classmethod + def host(cls) -> "Location": + return cls(kind="host", id=None) + + @classmethod + def host_numa(cls, numa_id: int) -> "Location": + return cls(kind="host_numa", id=numa_id) + + @classmethod + def host_numa_current(cls) -> "Location": + return cls(kind="host_numa_current", id=None) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index 005c9ec3cf..25191fe038 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -4,6 +4,7 @@ """Managed-memory range operations.""" +from cuda.core._memory._managed_location import Location from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch -__all__ = ["advise", "discard_prefetch", "prefetch"] +__all__ = ["Location", "advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 7ff15047e8..8b3db88b8d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1918,3 +1918,46 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): assert buffer.handle >= 0 assert buffer.size == 0 assert buffer.device_id == mr.device_id + + +class TestLocation: + def test_device_constructor(self): + from 
cuda.core.managed_memory import Location + loc = Location.device(0) + assert loc.kind == "device" + assert loc.id == 0 + + def test_host_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host() + assert loc.kind == "host" + assert loc.id is None + + def test_host_numa_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host_numa(3) + assert loc.kind == "host_numa" + assert loc.id == 3 + + def test_host_numa_current_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host_numa_current() + assert loc.kind == "host_numa_current" + assert loc.id is None + + def test_frozen(self): + import dataclasses + from cuda.core.managed_memory import Location + loc = Location.device(0) + with pytest.raises(dataclasses.FrozenInstanceError): + loc.id = 1 + + def test_invalid_device_id(self): + from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="device id must be >= 0"): + Location.device(-1) + + def test_invalid_kind(self): + from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="kind must be one of"): + Location(kind="not_a_kind", id=None) From c2dae533f073fab65d81f6524be78d9c2e129d1e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:02:51 -0700 Subject: [PATCH 20/31] feat(cuda.core): add _coerce_location helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralizes back-compat coercion for managed-memory Location inputs: - Location → passthrough - Device → Location.device(device_id) - int >= 0 → Location.device(int) - int == -1 → Location.host() - None → None when allow_none=True, else ValueError Will be used by the unified 1..N managed-memory ops API. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 29 ++++++++++++ cuda_core/tests/test_memory.py | 44 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 7e2515f573..e081a8da32 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -49,3 +49,32 @@ def host_numa(cls, numa_id: int) -> "Location": @classmethod def host_numa_current(cls) -> "Location": return cls(kind="host_numa_current", id=None) + + +def _coerce_location(value, *, allow_none: bool = False) -> Location | None: + """Coerce user input to a Location instance. + + Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device, + -1 → host), None (only if allow_none=True). + """ + from cuda.core._device import Device # avoid import cycle at module load + + if isinstance(value, Location): + return value + if isinstance(value, Device): + return Location.device(value.device_id) + if value is None: + if allow_none: + return None + raise ValueError("location is required") + if isinstance(value, int): + if value == -1: + return Location.host() + if value >= 0: + return Location.device(value) + raise ValueError( + f"device ordinal must be >= 0 (or -1 for host), got {value}" + ) + raise TypeError( + f"location must be a Location, Device, int, or None; got {type(value).__name__}" + ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8b3db88b8d..bccc0fa67b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1961,3 +1961,47 @@ def test_invalid_kind(self): from cuda.core.managed_memory import Location with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) + + +class TestLocationCoerce: + def test_passthrough(self): + from cuda.core._memory._managed_location import 
_coerce_location + from cuda.core.managed_memory import Location + loc = Location.device(0) + assert _coerce_location(loc) is loc + + def test_int_device(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(0).kind == "device" + assert _coerce_location(0).id == 0 + + def test_int_minus_one_is_host(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(-1).kind == "host" + + def test_device_object(self, init_cuda): + from cuda.core import Device + from cuda.core._memory._managed_location import _coerce_location + dev = Device() + loc = _coerce_location(dev) + assert loc.kind == "device" + assert loc.id == dev.device_id + + def test_none_when_disallowed(self): + from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="location is required"): + _coerce_location(None, allow_none=False) + + def test_none_when_allowed(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(None, allow_none=True) is None + + def test_bad_int(self): + from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="device ordinal"): + _coerce_location(-2) + + def test_bad_type(self): + from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(TypeError, match="Location, Device, int, or None"): + _coerce_location("device") From 935c8ba7b34a8c7e3afc391318d480baee23a551 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:18:37 -0700 Subject: [PATCH 21/31] test(cuda.core): update monkeypatch target after binding_version rename The legacy-bindings monkeypatch tests still referenced get_binding_version, which was renamed to binding_version in cf2f20d1be. Update both occurrences. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/test_memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index bccc0fa67b..2304c370fd 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1371,7 +1371,7 @@ def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) @@ -1396,7 +1396,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) From dc4653513bc04d1ce1fe1214630fdf628f13ef8a Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:19:59 -0700 Subject: [PATCH 22/31] refactor(cuda.core): tighten memory-attr query Address review feedback on _buffer.pyx: - Restore `inline` on `_init_mem_attrs` and `_query_memory_attrs`. - Set `out.is_managed = (is_managed != 0)` once outside the if/elif, rather than per-branch (driver leaves the attribute zero for non-managed pointers, so all three branches converged on the same value anyway). 
- Add a TODO noting that HMM/ATS-enabled sysmem should also report `is_managed=True`; the CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet. The Cython modernization of _managed_memory_ops.pyx (cimport cydriver, IF/ELSE for the 12/13 ABI split) is folded into Tasks 5-8 where the public API is being rewritten anyway; doing it here would mean rewriting the same call sites twice. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 6c7f8ffd14..4ca8650e8d 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -422,14 +422,14 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ -cdef void _init_mem_attrs(Buffer self): +cdef inline void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) self._mem_attrs_inited = True -cdef int _query_memory_attrs( +cdef inline int _query_memory_attrs( _MemAttrs& out, cydriver.CUdeviceptr ptr ) except -1 nogil: @@ -456,12 +456,15 @@ cdef int _query_memory_attrs( ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) HANDLE_RETURN(ret) + # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the + # CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet. 
+ out.is_managed = is_managed != 0 + if memory_type == 0: # unregistered host pointer out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 - out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -470,12 +473,10 @@ cdef int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id - out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id - out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") From 818f5d25d8416245b5f781d3d06b5c751337eaa6 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:53:12 -0700 Subject: [PATCH 23/31] feat(cuda.core): unified 1..N managed_memory.prefetch with cydriver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite prefetch() with the unified single-or-batched signature targeted by issue #1333: - prefetch(targets, location, *, options=None, stream) - targets accepts a single Buffer or a sequence of Buffers - location accepts a Location dataclass, Device, int (-1 = host), or a sequence broadcasting to per-buffer locations - length mismatch raises ValueError; empty targets raises ValueError - options is reserved for future per-call flags and must be None - stream moved to the end, kept keyword-only Internals: switch from Python-level driver.cuMemPrefetchAsync to Cython-level cydriver.cuMemPrefetchAsync via cimport cydriver, with HANDLE_RETURN. Replace the runtime _V2_BINDINGS check with compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE per the codebase precedent in _managed_memory_resource.pyx, _memory_pool.pyx, _tensor_map.pyx. N>1 dispatches to cydriver.cuMemPrefetchBatchAsync (CUDA 13 only); on CUDA 12 builds, batched prefetch raises NotImplementedError. 
Single-range prefetch continues to work on both CUDA 12 and 13 builds. The location_type= keyword is removed; callers express location kind via the Location dataclass added in 20d036ebe1. The advise() and discard_prefetch() functions still use the legacy _normalize_managed_location helper and Python-level driver calls; they will be migrated in their own tasks. Also drops test_managed_memory_prefetch_uses_legacy_bindings_signature, which monkeypatched the Python-level driver.cuMemPrefetchAsync — no longer applicable since the prefetch path uses cydriver. The corresponding advise legacy-bindings test stays for now (advise still uses Python driver). Closes Andy-Jost's review comment that the existing API is "non-Pythonic" by making it Pythonic in a different direction (typed Location dataclass) while preserving the free-function shape pending Leo's tie-break on ManagedBuffer subclass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 210 ++++++++++++++---- cuda_core/tests/test_memory.py | 147 +++++++++--- 2 files changed, 284 insertions(+), 73 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 81ff5582a6..b608b532ab 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,12 +4,19 @@ from __future__ import annotations +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs +from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._utils.version import binding_version from cuda.core._device import Device +from cuda.core._memory._managed_location import Location, 
_coerce_location cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( @@ -228,6 +235,74 @@ cdef void _require_managed_buffer(Buffer self, str what): raise ValueError(f"{what} requires a managed-memory allocation") +# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...]. +cdef tuple _coerce_buffer_targets(object targets, str what): + cdef list out + if isinstance(targets, Buffer): + return (targets,) + if isinstance(targets, (list, tuple)): + if not targets: + raise ValueError(f"{what}: empty targets sequence") + out = [] + for t in targets: + if not isinstance(t, Buffer): + raise TypeError( + f"{what}: each target must be a Buffer, got {type(t).__name__}" + ) + out.append(t) + return tuple(out) + raise TypeError( + f"{what}: targets must be a Buffer or sequence of Buffer, " + f"got {type(targets).__name__}" + ) + + +# Broadcast a single location across ``n`` targets, or coerce a length-N +# sequence elementwise. +cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): + cdef object coerced + if isinstance(location, (list, tuple)): + if len(location) != n: + raise ValueError( + f"{what}: location length {len(location)} does not match " + f"targets length {n}" + ) + return tuple(_coerce_location(loc, allow_none=allow_none) for loc in location) + coerced = _coerce_location(location, allow_none=allow_none) + return tuple([coerced] * n) + + +IF CUDA_CORE_BUILD_MAJOR >= 13: + # Convert a Location dataclass to a cydriver.CUmemLocation struct. 
+ cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc): + cdef cydriver.CUmemLocation out + cdef str kind = loc.kind + if kind == "device": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + out.id = loc.id + elif kind == "host": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + out.id = 0 + elif kind == "host_numa": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + out.id = loc.id + else: # host_numa_current + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT + out.id = 0 + return out +ELSE: + # CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host). + cdef inline int _to_legacy_device(object loc) except? -2: + cdef str kind = loc.kind + if kind == "device": + return loc.id + if kind == "host": + return -1 + raise RuntimeError( + f"location_type={kind!r} requires a CUDA 13 build of cuda.core" + ) + + cdef void _require_managed_discard_prefetch_support(str what): global _DISCARD_PREFETCH_SUPPORTED if _DISCARD_PREFETCH_SUPPORTED < 0: @@ -293,59 +368,106 @@ def advise( def prefetch( - target: Buffer, - location: Device | int | None = None, + targets, + location=None, *, - stream: Stream | GraphBuilder, - location_type: str | None = None, + options=None, + stream, ): - """Prefetch a managed-memory allocation range to a target location. + """Prefetch one or more managed-memory ranges to a target location. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous prefetch. - location_type : str | None, optional - Explicit location kind. 
Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to operate on. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; a + sequence must match ``len(targets)``. ``Device`` and ``int`` values + are coerced to :class:`Location` (``-1`` maps to host). + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous prefetch (keyword-only). + + Raises + ------ + NotImplementedError + If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``. """ - if not isinstance(target, Buffer): - raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "prefetch") + if options is not None: + raise TypeError( + f"prefetch options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size - location = _normalize_managed_location( - location, - location_type, - "prefetch", - ) - if _managed_location_uses_v2_bindings(): - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - location, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "prefetch") + + if n == 1: + _do_single_prefetch(bufs[0], locs[0], s) else: - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - _managed_location_to_legacy_device(location, "prefetch"), - s.handle, - ) + _do_batch_prefetch(bufs, locs, s) + + +cdef void _do_single_prefetch(Buffer buf, object loc, 
Stream s): + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef cydriver.CUmemLocation cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, cu_loc, 0, hstream)) + ELSE: + cdef int dev_int = _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream)) + + +cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( + ptrs, sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "batched prefetch requires a CUDA 13 build of cuda.core" ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 2304c370fd..89c8fda1c0 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1383,32 +1383,6 @@ def fake_cuMemAdvise(ptr, size, advice, location): buffer.close() -def 
test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - calls = [] - - def fake_cuMemPrefetchAsync(ptr, size, location, hstream): - calls.append((ptr, size, location, hstream)) - return (driver.CUresult.CUDA_SUCCESS,) - - monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) - monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) - - managed_memory.prefetch(buffer, device, stream=stream) - - assert len(calls) == 1 - assert calls[0][2] == device.device_id - assert int(calls[0][3]) == int(stream.handle) - - buffer.close() - - def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() @@ -1435,12 +1409,10 @@ def test_managed_memory_operation_validation(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="requires a location"): + with pytest.raises(ValueError, match="location is required"): managed_memory.prefetch(buffer, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") - with pytest.raises(ValueError, match="location must be None or -1"): - managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") buffer.close() @@ -2005,3 +1977,120 @@ def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location with pytest.raises(TypeError, match="Location, Device, int, or None"): _coerce_location("device") + + +class TestPrefetch: + def 
test_single_with_location_host(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == _HOST_LOCATION_ID + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream) + stream.sync() + + last0 = _get_int_mem_range_attr( + bufs[0], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 
) + last1 = _get_int_mem_range_attr( + bufs[1], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last0 == _HOST_LOCATION_ID + assert last1 == device.device_id + for buf in bufs: + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + with pytest.raises(ValueError, match="length"): + prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + prefetch(buf, Location.host(), stream=stream) + buf.close() + + def test_location_required(self, init_cuda): + from cuda.core.managed_memory import prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="location is required"): + prefetch(buf, None, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + prefetch(buf, Location.host(), options={}, stream=stream) + buf.close() 
From e296e72986b124dcbb07027e17160a5e0290b8b0 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:59:24 -0700 Subject: [PATCH 24/31] feat(cuda.core): add managed_memory.discard Adds a new discard(targets, *, options=None, stream) free function that wraps cuMemDiscardBatchAsync. Accepts a single Buffer or a sequence; N>=1 dispatches to the batched driver entry point. Requires a CUDA 13 build of cuda.core (NotImplementedError on CUDA 12 builds). Closes the second of three batched managed-memory operations from #1333: P1: cudaMemDiscardBatchAsync <- this commit P1: cudaMemPrefetchBatchAsync <- 818f5d25d8 P1: cudaMemDiscardAndPrefetchBatchAsync <- next commit Re-exported from cuda.core.managed_memory. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 71 +++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 4 +- cuda_core/tests/test_memory.py | 57 +++++++++++++++ 3 files changed, 130 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index b608b532ab..031b56a8af 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -313,6 +313,77 @@ cdef void _require_managed_discard_prefetch_support(str what): ) +def discard( + targets, + *, + options=None, + stream, +): + """Discard one or more managed-memory ranges. + + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard. Their resident pages + are released without prefetching new contents; subsequent access + is satisfied by lazy migration. + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous discard (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. 
Discard requires CUDA 13+. + """ + if options is not None: + raise TypeError( + f"discard options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard") + cdef Py_ssize_t n = len(bufs) + cdef Stream s = Stream_accept(stream) + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard") + + _do_batch_discard(bufs, s) + + +cdef void _do_batch_discard(tuple bufs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes): + PyMem_Free(ptrs) + PyMem_Free(sizes) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync( + ptrs, sizes, n, 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + ELSE: + raise NotImplementedError( + "discard requires a CUDA 13 build of cuda.core" + ) + + def advise( target: Buffer, advice: driver.CUmem_advise | str, diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index 25191fe038..509e874ccc 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -5,6 +5,6 @@ """Managed-memory range operations.""" from cuda.core._memory._managed_location import Location -from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch -__all__ = ["Location", "advise", "discard_prefetch", "prefetch"] +__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 
89c8fda1c0..c18fa72519 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2094,3 +2094,60 @@ def test_options_must_be_none(self, init_cuda): with pytest.raises(TypeError, match="must be None"): prefetch(buf, Location.host(), options={}, stream=stream) buf.close() + + +class TestDiscard: + def test_single_buffer(self, init_cuda): + from cuda.core.managed_memory import Location, discard, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + discard(buf, stream=stream) + stream.sync() + buf.close() + + def test_batched(self, init_cuda): + from cuda.core.managed_memory import Location, discard, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + discard(bufs, stream=stream) + stream.sync() + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import discard + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard(buf, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import discard + device = Device() + 
skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + discard(buf, options={}, stream=stream) + buf.close() From e697131defa9c65cce468b8f946e0f16f442744a Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:01:33 -0700 Subject: [PATCH 25/31] feat(cuda.core): unified 1..N managed_memory.discard_prefetch with cydriver Rewrite discard_prefetch() with the unified single-or-batched signature: discard_prefetch(targets, location, *, options=None, stream) - targets accepts a single Buffer or a sequence of Buffers - location accepts a Location, Device, int, or per-buffer sequence - length mismatch / empty targets raise ValueError - options must be None (reserved) - stream moved to end, kept keyword-only Internals: switch from Python-level driver.cuMemDiscardAndPrefetchBatchAsync to Cython-level cydriver.cuMemDiscardAndPrefetchBatchAsync. The runtime discard-prefetch availability check is replaced by compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE; on CUDA 12 builds the call raises NotImplementedError. The location_type= keyword is removed; use Location dataclass instead. Closes the third managed-memory batched op from #1333. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 117 ++++++++++++------ cuda_core/tests/test_memory.py | 70 +++++++++++ 2 files changed, 147 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 031b56a8af..2192688320 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -543,51 +543,88 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): def discard_prefetch( - target: Buffer, - location: Device | int | None = None, + targets, + location=None, *, - stream: Stream | GraphBuilder, - location_type: str | None = None, + options=None, + stream, ): - """Discard a managed-memory allocation range and prefetch it to a target location. + """Discard one or more managed-memory ranges and prefetch them to a target location. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for discard_prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous operation. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard and re-prefetch. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; + a sequence must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. 
+ stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous operation (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch + requires CUDA 13+. """ - if not isinstance(target, Buffer): - raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "discard_prefetch") - _require_managed_discard_prefetch_support("discard_prefetch") + if options is not None: + raise TypeError( + f"discard_prefetch options must be None (reserved); " + f"got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size - cdef object batch_ptr = driver.CUdeviceptr(int(ptr)) - location = _normalize_managed_location( - location, - location_type, - "discard_prefetch", - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [batch_ptr], - [nbytes], - _SINGLE_RANGE_COUNT, - [location], - [_FIRST_PREFETCH_LOCATION_INDEX], - _SINGLE_PREFETCH_LOCATION_COUNT, - _MANAGED_OPERATION_FLAGS, - s.handle, + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard_prefetch") + + _do_batch_discard_prefetch(bufs, locs, s) + + +cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and 
loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( + ptrs, sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "discard_prefetch requires a CUDA 13 build of cuda.core" ) - ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c18fa72519..627a60bb3f 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2151,3 +2151,73 @@ def test_options_must_be_none(self, init_cuda): with pytest.raises(TypeError, match="must be None"): discard(buf, options={}, stream=stream) buf.close() + + +class TestDiscardPrefetch: + def test_single_buffer(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + discard_prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device 
= Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + prefetch(bufs, Location.host(), stream=stream) + stream.sync() + discard_prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + with pytest.raises(ValueError, match="length"): + discard_prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard_prefetch(buf, Location.host(), stream=stream) + buf.close() From 3bc10219dc3086d5449aa811e2f6086b73d915fb Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:08:40 -0700 Subject: [PATCH 26/31] feat(cuda.core): unified 1..N managed_memory.advise + drop legacy apparatus Rewrite advise() with the unified single-or-batched signature: advise(targets, advice, location=None, *, options=None) - targets accepts a single Buffer or a sequence 
- advice still accepts string aliases or driver.CUmem_advise enum values - location accepts Location dataclass, Device, int, None, or per-buffer sequence (None permitted only for set_read_mostly, unset_read_mostly, unset_preferred_location) - Per-advice allowed-kind validation ported to operate on Location.kind (matches CUDA driver constraints from existing tables) - options reserved for future per-call flags - For N>1, loops cydriver.cuMemAdvise per buffer (no batched advise API exists in CUDA) Internals: switch to cydriver.cuMemAdvise (Cython-level); use compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE for the 12/13 ABI split. Drop the legacy apparatus that all four functions previously shared: - _normalize_managed_location (returned Python driver.CUmemLocation) - _make_managed_location, _managed_location_enum - _managed_location_uses_v2_bindings + _V2_BINDINGS lazy cache - _managed_location_to_legacy_device + _LEGACY_LOC_DEVICE/HOST cache - _require_managed_discard_prefetch_support - Unused module-level constants (_HOST_NUMA_CURRENT_ID, _SINGLE_RANGE_COUNT, _MANAGED_OPERATION_FLAGS, etc.) Also drop test_managed_memory_advise_uses_legacy_bindings_signature and the _LEGACY_BINDINGS_VERSION constant; the runtime version switch is gone, replaced by compile-time IF/ELSE that the test could not exercise. The CUDA 12 vs CUDA 13 paths are now covered by the build-matrix CI job. Closes Task 8 (advise) and Task 9 (legacy-bindings test cleanup) from docs/superpowers/plans/2026-04-27-managed-memory-ops-batched.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 270 +++++------------- cuda_core/tests/test_memory.py | 91 ++++-- 2 files changed, 127 insertions(+), 234 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 2192688320..11236a1ecf 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -13,26 +13,10 @@ from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core._utils.cuda_utils import driver, handle_return -from cuda.core._utils.version import binding_version -from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver from cuda.core._memory._managed_location import Location, _coerce_location -cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( - "device", - "host", - "host_numa", - "host_numa_current", -) - -cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { - "device": "CU_MEM_LOCATION_TYPE_DEVICE", - "host": "CU_MEM_LOCATION_TYPE_HOST", - "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", - "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", -} - cdef dict _MANAGED_ADVICE_ALIASES = { "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", @@ -61,43 +45,8 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -cdef int _HOST_NUMA_CURRENT_ID = 0 -cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 -cdef size_t _SINGLE_RANGE_COUNT = 1 -cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 -cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 - -# Lazily cached values for immutable runtime properties. -cdef object _CU_DEVICE_CPU = None +# Lazily cached: maps driver.CUmem_advise enum value → string alias. 
cdef dict _ADVICE_ENUM_TO_ALIAS = None -_V2_BINDINGS = -1 -cdef int _DISCARD_PREFETCH_SUPPORTED = -1 - - -cdef object _managed_location_enum(str location_type): - cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - cdef object result = getattr(driver.CUmemLocationType, attr_name, None) - if result is None: - raise RuntimeError( - f"Managed-memory location type {location_type!r} is not supported by the " - f"installed cuda.bindings package." - ) - return result - - -cdef object _make_managed_location(str location_type, int location_id): - global _CU_DEVICE_CPU - cdef object location = driver.CUmemLocation() - location.type = _managed_location_enum(location_type) - if location_type == "host": - if _CU_DEVICE_CPU is None: - _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) - location.id = _CU_DEVICE_CPU - elif location_type == "host_numa_current": - location.id = _HOST_NUMA_CURRENT_ID - else: - location.id = location_id - return location cdef tuple _normalize_managed_advice(object advice): @@ -131,104 +80,6 @@ cdef tuple _normalize_managed_advice(object advice): ) -cdef object _normalize_managed_location( - object location, - object location_type, - str what, - bint allow_none=False, - frozenset allowed_loctypes=_ALL_LOCATION_TYPES, -): - cdef object loc_type - cdef int loc_id - - if isinstance(location, Device): - location = location.device_id - - if location_type is not None and not isinstance(location_type, str): - raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") - - loc_type = None if location_type is None else (location_type).lower() - if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: - raise ValueError( - f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " - f"or None, got {location_type!r}" - ) - - if loc_type is not None and loc_type not in allowed_loctypes: - raise ValueError(f"{what} does not support location_type='{loc_type}'") - - if 
loc_type is None: - if location is None: - if allow_none: - return _make_managed_location("host", -1) - raise ValueError(f"{what} requires a location") - if not isinstance(location, int): - raise TypeError( - f"{what} location must be a Device, int, or None, got {type(location).__name__}" - ) - loc_id = location - if loc_id == -1: - if "host" not in allowed_loctypes: - raise ValueError(f"{what} does not support host locations") - return _make_managed_location("host", -1) - elif loc_id >= 0: - return _make_managed_location("device", loc_id) - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" - ) - elif loc_type == "device": - if isinstance(location, int) and location >= 0: - loc_id = location - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" - ) - return _make_managed_location(loc_type, loc_id) - elif loc_type == "host": - if location not in (None, -1): - raise ValueError( - f"{what} location must be None or -1 when location_type is 'host', got {location!r}" - ) - return _make_managed_location(loc_type, -1) - elif loc_type == "host_numa": - if not isinstance(location, int) or location < 0: - raise ValueError( - f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" - ) - return _make_managed_location(loc_type, location) - else: - if location is not None: - raise ValueError( - f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" - ) - return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - - -cdef bint _managed_location_uses_v2_bindings(): - # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. 
- global _V2_BINDINGS - if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0 - return _V2_BINDINGS != 0 - - -cdef object _LEGACY_LOC_DEVICE = None -cdef object _LEGACY_LOC_HOST = None - -cdef int _managed_location_to_legacy_device(object location, str what): - global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST - if _LEGACY_LOC_DEVICE is None: - _LEGACY_LOC_DEVICE = _managed_location_enum("device") - _LEGACY_LOC_HOST = _managed_location_enum("host") - cdef object loc_type = location.type - if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: - return location.id - raise RuntimeError( - f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" - ) - - cdef void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: @@ -303,16 +154,6 @@ ELSE: ) -cdef void _require_managed_discard_prefetch_support(str what): - global _DISCARD_PREFETCH_SUPPORTED - if _DISCARD_PREFETCH_SUPPORTED < 0: - _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 - if not _DISCARD_PREFETCH_SUPPORTED: - raise RuntimeError( - f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" - ) - - def discard( targets, *, @@ -385,57 +226,80 @@ cdef void _do_batch_discard(tuple bufs, Stream s): def advise( - target: Buffer, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, + targets, + advice, + location=None, *, - location_type: str | None = None, + options=None, ): - """Apply managed-memory advice to an allocation range. + """Apply managed-memory advice to one or more allocation ranges. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. 
- location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to advise. + advice : str | :obj:`~driver.CUmem_advise` + Managed-memory advice. String aliases (``"set_read_mostly"``, + ``"unset_read_mostly"``, ``"set_preferred_location"``, + ``"unset_preferred_location"``, ``"set_accessed_by"``, + ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). Required for advice values that consult a + location; ignored (may be ``None``) for ``set_read_mostly``, + ``unset_read_mostly``, and ``unset_preferred_location``. A sequence + must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. 
""" - if not isinstance(target, Buffer): - raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "advise") + if options is not None: + raise TypeError( + f"advise options must be None (reserved); got {type(options).__name__}" + ) cdef str advice_name - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size + cdef object advice_value + advice_name, advice_value = _normalize_managed_advice(advice) + cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION + cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name] - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - ptr, - nbytes, - advice, - _managed_location_to_legacy_device(location, "advise"), + cdef tuple bufs = _coerce_buffer_targets(targets, "advise") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise") + + cdef Buffer buf + cdef object loc + for buf in bufs: + _require_managed_buffer(buf, "advise") + for loc in locs: + if loc is not None and loc.kind not in allowed_kinds: + raise ValueError( + f"advise '{advice_name}' does not support location_type='{loc.kind}'" ) - ) + + cdef Py_ssize_t i + for i in range(n): + _do_single_advise(bufs[i], advice_value, locs[i], allow_none) + + +cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none): + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUmem_advise advice_enum = (int(advice_value)) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef 
cydriver.CUmemLocation cu_loc + if loc is None: + # Driver ignores location for read_mostly / unset_preferred_location + # advice values but still validates the CUmemLocation; pass a + # host placeholder. + cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + cu_loc.id = 0 + else: + cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, cu_loc)) + ELSE: + cdef int dev_int = -1 if loc is None else _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int)) def prefetch( diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 627a60bb3f..a469c63a10 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -54,7 +54,6 @@ _READ_MOSTLY_ENABLED = 1 _HOST_LOCATION_ID = -1 _INVALID_HOST_DEVICE_ORDINAL = 0 -_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1264,6 +1263,8 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): + from cuda.core.managed_memory import Location + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -1281,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. 
- managed_memory.advise(buffer, "set_preferred_location", location_type="host") + managed_memory.advise(buffer, "set_preferred_location", Location.host()) preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, @@ -1359,30 +1360,6 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer.close() -def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - calls = [] - - def fake_cuMemAdvise(ptr, size, advice, location): - calls.append((ptr, size, advice, location)) - return (driver.CUresult.CUDA_SUCCESS,) - - monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) - monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) - - managed_memory.advise(buffer, "set_read_mostly") - - assert len(calls) == 1 - assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID)) - - buffer.close() - - def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() @@ -1411,14 +1388,17 @@ def test_managed_memory_operation_validation(init_cuda): with pytest.raises(ValueError, match="location is required"): managed_memory.prefetch(buffer, stream=stream) + from cuda.core.managed_memory import Location with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() def test_managed_memory_advise_location_validation(init_cuda): """Verify 
doc-specified location constraints for each advice kind.""" + from cuda.core.managed_memory import Location + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -1431,16 +1411,16 @@ def test_managed_memory_advise_location_validation(init_cuda): # set_preferred_location requires a location; device ordinal works managed_memory.advise(buffer, "set_preferred_location", device.device_id) - # set_preferred_location with host location_type - managed_memory.advise(buffer, "set_preferred_location", location_type="host") + # set_preferred_location with host location + managed_memory.advise(buffer, "set_preferred_location", Location.host()) # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0)) # set_accessed_by with host_numa_current also raises ValueError with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current()) # Inferred location from int: -1 maps to host, 0 maps to device managed_memory.advise(buffer, "set_preferred_location", -1) @@ -2221,3 +2201,52 @@ def test_rejects_non_managed(self, init_cuda): with pytest.raises(ValueError, match="managed-memory"): discard_prefetch(buf, Location.host(), stream=stream) buf.close() + + +class TestAdvise: + def test_batched_same_advice(self, init_cuda): + from cuda.core.managed_memory import advise, Location + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [ + DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + for _ in range(2) + ] + advise(bufs, 
"set_read_mostly") + for buf in bufs: + assert ( + _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.managed_memory import advise, Location + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [ + DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + for _ in range(2) + ] + advise( + bufs, + "set_preferred_location", + [Location.host(), Location.device(device.device_id)], + ) + for buf in bufs: + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import advise + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + with pytest.raises(TypeError, match="must be None"): + advise(buf, "set_read_mostly", options={}) + buf.close() From fa238696802fc762b0008a20c091e998ab7e7b2b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:10:21 -0700 Subject: [PATCH 27/31] refactor(cuda.core): use Buffer.is_managed property in managed_memory ops _require_managed_buffer was poking at Buffer._mem_attrs.is_managed directly via _init_mem_attrs(). PR #1924 added the public Buffer.is_managed property which falls back to MemoryResource.is_managed when the pointer attribute query does not advertise managed memory (the case for pool- allocated managed memory). Switch _require_managed_buffer to the public property. This also fixes a latent bug where pool-allocated managed buffers were being rejected by the managed_memory ops despite Buffer.is_managed correctly reporting True. Drops the no-longer-needed cimport of _init_mem_attrs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 11236a1ecf..f4e13ef16e 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -8,7 +8,7 @@ from cpython.mem cimport PyMem_Free, PyMem_Malloc from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs +from cuda.core._memory._buffer cimport Buffer from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -81,8 +81,10 @@ cdef tuple _normalize_managed_advice(object advice): cdef void _require_managed_buffer(Buffer self, str what): - _init_mem_attrs(self) - if not self._mem_attrs.is_managed: + # Buffer.is_managed handles both pointer-attribute and memory-resource + # paths (e.g. pool-allocated managed memory whose pointer attribute + # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED). + if not self.is_managed: raise ValueError(f"{what} requires a managed-memory allocation") From 68bdd14357598b53dc7c0d7a2654b014d876f58f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:10:56 -0700 Subject: [PATCH 28/31] docs(cuda.core): document Location, discard, and 1..N managed_memory ops api.rst: add Location and discard to the managed_memory autosummary. 1.0.0-notes.rst: replace the placeholder bullet with a description of the unified 1..N API, the Location dataclass, and the dispatch to batched driver entry points on cuda.bindings 12.8+. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/docs/source/api.rst | 2 ++ cuda_core/docs/source/release/1.0.0-notes.rst | 15 ++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index b7df6d7b96..fd0e01dedf 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -137,8 +137,10 @@ Managed memory .. autosummary:: :toctree: generated/ + Location advise prefetch + discard discard_prefetch .. module:: cuda.core diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 4008c86f5d..25e9066761 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -17,11 +17,16 @@ New features ------------ - Added managed-memory range operations under :mod:`cuda.core.managed_memory`: - ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free - functions accept either a managed :class:`Buffer` or a raw pointer plus - ``size=``, validate that the target allocation is managed memory, and then - forward to the corresponding CUDA driver operations for range advice and - migration. + :class:`~managed_memory.Location`, :func:`~managed_memory.advise`, + :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and + :func:`~managed_memory.discard_prefetch`. Each operation accepts either a + single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ + the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver + entry point, addressing the managed-memory portion of #1333. Locations + are expressed via the typed :class:`~managed_memory.Location` dataclass + (with classmethod constructors ``device``, ``host``, ``host_numa``, and + ``host_numa_current``); ``Device`` and ``int`` values are still accepted + for ergonomic compatibility. 
Fixes and enhancements From b4d9cbfa7270e7da9e260d457a1678f38bd2833d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:20:19 -0700 Subject: [PATCH 29/31] chore(cuda.core): drop narrative comments and tighten _coerce_location docstring Per /simplify review, remove WHAT-only comments that just restate the function signature in front of _coerce_buffer_targets and _broadcast_locations. Tighten the _coerce_location docstring to lead with the conversion intent rather than restate the type annotation. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_location.py | 5 ++--- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index e081a8da32..8d1605153f 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -52,10 +52,9 @@ def host_numa_current(cls) -> "Location": def _coerce_location(value, *, allow_none: bool = False) -> Location | None: - """Coerce user input to a Location instance. + """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``. - Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device, - -1 → host), None (only if allow_none=True). + Maps int ``-1`` to host and other non-negative ints to that device ordinal. 
""" from cuda.core._device import Device # avoid import cycle at module load diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index f4e13ef16e..90e5611a2d 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -88,7 +88,6 @@ cdef void _require_managed_buffer(Buffer self, str what): raise ValueError(f"{what} requires a managed-memory allocation") -# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...]. cdef tuple _coerce_buffer_targets(object targets, str what): cdef list out if isinstance(targets, Buffer): @@ -110,8 +109,6 @@ cdef tuple _coerce_buffer_targets(object targets, str what): ) -# Broadcast a single location across ``n`` targets, or coerce a length-N -# sequence elementwise. cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): cdef object coerced if isinstance(location, (list, tuple)): From ee967583b78d014723db47b9cc4b145bf9c031fa Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:41:19 -0700 Subject: [PATCH 30/31] chore(cuda.core): satisfy pre-commit hooks - ruff auto-applied: * Drop unused `_managed_memory_ops` test import (no longer needed after the legacy-bindings monkeypatch test was deleted) * Drop "Location" string-quoted forward refs in _managed_location.py (file already uses `from __future__ import annotations`) * Reformat string concatenations and add blank-line-after-import spacing - cython-lint auto-applied: * Drop unused libc.stdint cimport of `uintptr_t` * Drop unused `Location` Python import (only used in docstrings) * Drop unused `n` local in `discard()` * Move `cpython.mem cimport` of PyMem_Free / PyMem_Malloc inside the `IF CUDA_CORE_BUILD_MAJOR >= 13:` block where the symbols are actually used; cython-lint cannot see across compile-time branches. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 16 +++--- .../cuda/core/_memory/_managed_memory_ops.pyx | 7 ++- cuda_core/tests/test_memory.py | 51 +++++++++++++++---- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 8d1605153f..0e89cb92e3 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -35,19 +35,19 @@ def __post_init__(self) -> None: raise ValueError(f"{self.kind} location must have id=None") @classmethod - def device(cls, device_id: int) -> "Location": + def device(cls, device_id: int) -> Location: return cls(kind="device", id=device_id) @classmethod - def host(cls) -> "Location": + def host(cls) -> Location: return cls(kind="host", id=None) @classmethod - def host_numa(cls, numa_id: int) -> "Location": + def host_numa(cls, numa_id: int) -> Location: return cls(kind="host_numa", id=numa_id) @classmethod - def host_numa_current(cls) -> "Location": + def host_numa_current(cls) -> Location: return cls(kind="host_numa_current", id=None) @@ -71,9 +71,5 @@ def _coerce_location(value, *, allow_none: bool = False) -> Location | None: return Location.host() if value >= 0: return Location.device(value) - raise ValueError( - f"device ordinal must be >= 0 (or -1 for host), got {value}" - ) - raise TypeError( - f"location must be a Location, Device, int, or None; got {type(value).__name__}" - ) + raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}") + raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 90e5611a2d..9926cbe67f 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ 
-4,8 +4,8 @@ from __future__ import annotations -from cpython.mem cimport PyMem_Free, PyMem_Malloc -from libc.stdint cimport uintptr_t +IF CUDA_CORE_BUILD_MAJOR >= 13: + from cpython.mem cimport PyMem_Free, PyMem_Malloc from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer @@ -14,7 +14,7 @@ from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import driver -from cuda.core._memory._managed_location import Location, _coerce_location +from cuda.core._memory._managed_location import _coerce_location cdef dict _MANAGED_ADVICE_ALIASES = { @@ -182,7 +182,6 @@ def discard( f"discard options must be None (reserved); got {type(options).__name__}" ) cdef tuple bufs = _coerce_buffer_targets(targets, "discard") - cdef Py_ssize_t n = len(bufs) cdef Stream s = Stream_accept(stream) cdef Buffer buf diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a469c63a10..36fdfd0347 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -44,7 +44,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops +from cuda.core._memory import IPCBufferDescriptor from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1389,6 +1389,7 @@ def test_managed_memory_operation_validation(init_cuda): with pytest.raises(ValueError, match="location is required"): managed_memory.prefetch(buffer, stream=stream) from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) @@ -1875,42 +1876,50 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): class TestLocation: def test_device_constructor(self): from 
cuda.core.managed_memory import Location + loc = Location.device(0) assert loc.kind == "device" assert loc.id == 0 def test_host_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host() assert loc.kind == "host" assert loc.id is None def test_host_numa_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host_numa(3) assert loc.kind == "host_numa" assert loc.id == 3 def test_host_numa_current_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host_numa_current() assert loc.kind == "host_numa_current" assert loc.id is None def test_frozen(self): import dataclasses + from cuda.core.managed_memory import Location + loc = Location.device(0) with pytest.raises(dataclasses.FrozenInstanceError): loc.id = 1 def test_invalid_device_id(self): from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="device id must be >= 0"): Location.device(-1) def test_invalid_kind(self): from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) @@ -1919,21 +1928,25 @@ class TestLocationCoerce: def test_passthrough(self): from cuda.core._memory._managed_location import _coerce_location from cuda.core.managed_memory import Location + loc = Location.device(0) assert _coerce_location(loc) is loc def test_int_device(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(0).kind == "device" assert _coerce_location(0).id == 0 def test_int_minus_one_is_host(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(-1).kind == "host" def test_device_object(self, init_cuda): from cuda.core import Device from cuda.core._memory._managed_location import _coerce_location + dev = Device() loc = _coerce_location(dev) assert loc.kind == "device" @@ -1941,20 +1954,24 @@ def test_device_object(self, init_cuda): def 
test_none_when_disallowed(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="location is required"): _coerce_location(None, allow_none=False) def test_none_when_allowed(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(None, allow_none=True) is None def test_bad_int(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="device ordinal"): _coerce_location(-2) def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(TypeError, match="Location, Device, int, or None"): _coerce_location("device") @@ -1962,6 +1979,7 @@ def test_bad_type(self): class TestPrefetch: def test_single_with_location_host(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -1980,6 +1998,7 @@ def test_single_with_location_host(self, init_cuda): def test_batched_same_location(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemPrefetchBatchAsync"): @@ -2002,6 +2021,7 @@ def test_batched_same_location(self, init_cuda): def test_batched_per_buffer_location(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemPrefetchBatchAsync"): @@ -2029,6 +2049,7 @@ def test_batched_per_buffer_location(self, init_cuda): def test_length_mismatch(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2043,6 +2064,7 @@ def test_length_mismatch(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + 
device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2053,6 +2075,7 @@ def test_rejects_non_managed(self, init_cuda): def test_location_required(self, init_cuda): from cuda.core.managed_memory import prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2065,6 +2088,7 @@ def test_location_required(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2079,6 +2103,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscard: def test_single_buffer(self, init_cuda): from cuda.core.managed_memory import Location, discard, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardBatchAsync"): @@ -2095,6 +2120,7 @@ def test_single_buffer(self, init_cuda): def test_batched(self, init_cuda): from cuda.core.managed_memory import Location, discard, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardBatchAsync"): @@ -2112,6 +2138,7 @@ def test_batched(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import discard + device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2122,6 +2149,7 @@ def test_rejects_non_managed(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import discard + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2136,6 +2164,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscardPrefetch: def test_single_buffer(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not 
hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): @@ -2159,6 +2188,7 @@ def test_single_buffer(self, init_cuda): def test_batched_same_location(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): @@ -2181,6 +2211,7 @@ def test_batched_same_location(self, init_cuda): def test_length_mismatch(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2194,6 +2225,7 @@ def test_length_mismatch(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch + device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2205,14 +2237,12 @@ def test_rejects_non_managed(self, init_cuda): class TestAdvise: def test_batched_same_advice(self, init_cuda): - from cuda.core.managed_memory import advise, Location + from cuda.core.managed_memory import advise + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [ - DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - for _ in range(2) - ] + bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise(bufs, "set_read_mostly") for buf in bufs: assert ( @@ -2225,14 +2255,12 @@ def test_batched_same_advice(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import advise, Location + from cuda.core.managed_memory import Location, advise + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [ - DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - for _ in range(2) - ] + bufs = 
[DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise( bufs, "set_preferred_location", @@ -2243,6 +2271,7 @@ def test_batched_per_buffer_location(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import advise + device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() From d6f60f247a8572de41a2abfc20d872898bdf71f8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 19:08:33 -0700 Subject: [PATCH 31/31] refactor(cuda.core): move managed_memory ops to cuda.core.utils Per Leo's review request (https://github.com/NVIDIA/cuda-python/pull/1775#discussion_r2991209111), fold the managed-memory free functions and the Location dataclass into cuda.core.utils rather than maintaining a dedicated cuda.core.managed_memory namespace. - Re-export Location, advise, prefetch, discard, discard_prefetch from cuda.core.utils. - Delete cuda.core.managed_memory module. - Update cuda.core.__init__ to drop the managed_memory submodule import. - Update tests to import from cuda.core.utils. - Update api.rst: drop the dedicated Managed memory section; add the managed-memory entries to the Utility functions section. - Update 1.0.0-notes.rst accordingly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/managed_memory.py | 10 -- cuda_core/cuda/core/utils.py | 9 +- cuda_core/docs/source/api.rst | 23 +--- cuda_core/docs/source/release/1.0.0-notes.rst | 22 ++-- cuda_core/tests/test_memory.py | 108 +++++++++--------- 6 files changed, 79 insertions(+), 95 deletions(-) delete mode 100644 cuda_core/cuda/core/managed_memory.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 61315dda5a..dfd52accea 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ def _import_versioned_module(): del _import_versioned_module -from cuda.core import managed_memory, system, utils +from cuda.core import system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graphics import GraphicsResource diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py deleted file mode 100644 index 509e874ccc..0000000000 --- a/cuda_core/cuda/core/managed_memory.py +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -"""Managed-memory range operations.""" - -from cuda.core._memory._managed_location import Location -from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch - -__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"] diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index f15d924277..3d4b3e4c59 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -1,7 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 +from cuda.core._memory._managed_location import Location # noqa: F401 +from cuda.core._memory._managed_memory_ops import ( + advise, # noqa: F401 + discard, # noqa: F401 + discard_prefetch, # noqa: F401 + prefetch, # noqa: F401 +) from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fd0e01dedf..fa17624fa5 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -129,24 +129,6 @@ Each subclass exposes attributes unique to its operation type. graph.SwitchNode -.. module:: cuda.core.managed_memory - -Managed memory --------------- - -.. autosummary:: - :toctree: generated/ - - Location - advise - prefetch - discard - discard_prefetch - -.. module:: cuda.core - :no-index: - - Graphics interoperability ------------------------- @@ -265,7 +247,12 @@ Utility functions :toctree: generated/ args_viewable_as_strided_memory + advise + prefetch + discard + discard_prefetch :template: autosummary/cyclass.rst + Location StridedMemoryView diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 25e9066761..17696b616a 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -16,17 +16,17 @@ Highlights New features ------------ -- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: - :class:`~managed_memory.Location`, :func:`~managed_memory.advise`, - :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and - :func:`~managed_memory.discard_prefetch`. Each operation accepts either a - single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ - the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver - entry point, addressing the managed-memory portion of #1333. 
Locations - are expressed via the typed :class:`~managed_memory.Location` dataclass - (with classmethod constructors ``device``, ``host``, ``host_numa``, and - ``host_numa_current``); ``Device`` and ``int`` values are still accepted - for ergonomic compatibility. +- Added managed-memory range operations to :mod:`cuda.core.utils`: + :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`, + :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each + operation accepts either a single managed :class:`Buffer` or a + sequence; with cuda.bindings 12.8+ the N>1 case dispatches to the + corresponding ``cuMem*BatchAsync`` driver entry point, addressing the + managed-memory portion of #1333. Locations are expressed via the typed + :class:`~utils.Location` dataclass (with classmethod constructors + ``device``, ``host``, ``host_numa``, and ``host_numa_current``); + ``Device`` and ``int`` values are still accepted for ergonomic + compatibility. Fixes and enhancements diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 36fdfd0347..18f7bed114 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,7 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, - managed_memory, + utils, ) from cuda.core import ( system as ccx_system, @@ -1243,7 +1243,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -1251,7 +1251,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): ) assert last_location == _HOST_LOCATION_ID - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) stream.sync() 
last_location = _get_int_mem_range_attr( buffer, @@ -1263,7 +1263,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -1271,7 +1271,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") assert ( _get_int_mem_range_attr( buffer, @@ -1282,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. - managed_memory.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Location.host()) preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, @@ -1300,7 +1300,7 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1322,10 +1322,10 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_ buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - managed_memory.discard_prefetch(buffer, device, stream=stream) + 
utils.discard_prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1345,10 +1345,10 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - managed_memory.discard_prefetch(buffer, device, stream=stream) + utils.discard_prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1368,11 +1368,11 @@ def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.discard_prefetch(buffer, device, stream=stream) + utils.discard_prefetch(buffer, device, stream=stream) buffer.close() @@ -1387,18 +1387,18 @@ def test_managed_memory_operation_validation(init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="location is required"): - managed_memory.prefetch(buffer, stream=stream) - from cuda.core.managed_memory import Location + utils.prefetch(buffer, stream=stream) + from cuda.core.utils import Location with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) + utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() def 
test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" - from cuda.core.managed_memory import Location + from cuda.core.utils import Location device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -1407,25 +1407,25 @@ def test_managed_memory_advise_location_validation(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) # set_read_mostly works without a location (location is ignored) - managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") # set_preferred_location requires a location; device ordinal works - managed_memory.advise(buffer, "set_preferred_location", device.device_id) + utils.advise(buffer, "set_preferred_location", device.device_id) # set_preferred_location with host location - managed_memory.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Location.host()) # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0)) + utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) # set_accessed_by with host_numa_current also raises ValueError with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current()) + utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) # Inferred location from int: -1 maps to host, 0 maps to device - managed_memory.advise(buffer, "set_preferred_location", -1) - managed_memory.advise(buffer, "set_preferred_location", 0) + utils.advise(buffer, "set_preferred_location", -1) + utils.advise(buffer, "set_preferred_location", 0) buffer.close() @@ -1439,7 +1439,7 @@ def 
test_managed_memory_advise_accepts_enum_value(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY - managed_memory.advise(buffer, advice_enum) + utils.advise(buffer, advice_enum) assert ( _get_int_mem_range_attr( @@ -1461,10 +1461,10 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) with pytest.raises(ValueError, match="advice must be one of"): - managed_memory.advise(buffer, "not_a_real_advice") + utils.advise(buffer, "not_a_real_advice") with pytest.raises(TypeError, match="advice must be"): - managed_memory.advise(buffer, 42) + utils.advise(buffer, 42) buffer.close() @@ -1875,28 +1875,28 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): class TestLocation: def test_device_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) assert loc.kind == "device" assert loc.id == 0 def test_host_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host() assert loc.kind == "host" assert loc.id is None def test_host_numa_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host_numa(3) assert loc.kind == "host_numa" assert loc.id == 3 def test_host_numa_current_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host_numa_current() assert loc.kind == "host_numa_current" @@ -1905,20 +1905,20 @@ def test_host_numa_current_constructor(self): def test_frozen(self): import dataclasses - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) with pytest.raises(dataclasses.FrozenInstanceError): loc.id = 1 def 
test_invalid_device_id(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location with pytest.raises(ValueError, match="device id must be >= 0"): Location.device(-1) def test_invalid_kind(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) @@ -1927,7 +1927,7 @@ def test_invalid_kind(self): class TestLocationCoerce: def test_passthrough(self): from cuda.core._memory._managed_location import _coerce_location - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) assert _coerce_location(loc) is loc @@ -1978,7 +1978,7 @@ def test_bad_type(self): class TestPrefetch: def test_single_with_location_host(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -1997,7 +1997,7 @@ def test_single_with_location_host(self, init_cuda): buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2020,7 +2020,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2048,7 +2048,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2063,7 +2063,7 @@ def test_length_mismatch(self, init_cuda): 
buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() device.set_current() @@ -2074,7 +2074,7 @@ def test_rejects_non_managed(self, init_cuda): buf.close() def test_location_required(self, init_cuda): - from cuda.core.managed_memory import prefetch + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2087,7 +2087,7 @@ def test_location_required(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2102,7 +2102,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscard: def test_single_buffer(self, init_cuda): - from cuda.core.managed_memory import Location, discard, prefetch + from cuda.core.utils import Location, discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2119,7 +2119,7 @@ def test_single_buffer(self, init_cuda): buf.close() def test_batched(self, init_cuda): - from cuda.core.managed_memory import Location, discard, prefetch + from cuda.core.utils import Location, discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2137,7 +2137,7 @@ def test_batched(self, init_cuda): buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import discard + from cuda.core.utils import discard device = Device() device.set_current() @@ -2148,7 +2148,7 @@ def test_rejects_non_managed(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import discard + from cuda.core.utils import discard device = Device() skip_if_managed_memory_unsupported(device) @@ -2163,7 +2163,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscardPrefetch: 
def test_single_buffer(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch, prefetch + from cuda.core.utils import Location, discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2187,7 +2187,7 @@ def test_single_buffer(self, init_cuda): buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch, prefetch + from cuda.core.utils import Location, discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2210,7 +2210,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch + from cuda.core.utils import Location, discard_prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2224,7 +2224,7 @@ def test_length_mismatch(self, init_cuda): buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch + from cuda.core.utils import Location, discard_prefetch device = Device() device.set_current() @@ -2237,7 +2237,7 @@ def test_rejects_non_managed(self, init_cuda): class TestAdvise: def test_batched_same_advice(self, init_cuda): - from cuda.core.managed_memory import advise + from cuda.core.utils import advise device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -2255,7 +2255,7 @@ def test_batched_same_advice(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import Location, advise + from cuda.core.utils import Location, advise device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -2270,7 +2270,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import advise + from cuda.core.utils import advise device = Device() 
_skip_if_managed_allocation_unsupported(device)