diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 04b5707e18e..9065da77eb8 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle( MemoryResource mr, object ipc_descriptor = * ) + +# Memory attribute query helpers (used by _managed_memory_ops) +cdef void _init_mem_attrs(Buffer self) +cdef int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr, +) except -1 nogil diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index bb6fd97df6f..4ca8650e8db 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -71,6 +71,7 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`. """ + cdef class Buffer: """Represent a handle to allocated memory. @@ -455,12 +456,15 @@ cdef inline int _query_memory_attrs( ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) HANDLE_RETURN(ret) + # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the + # CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet. 
+ out.is_managed = is_managed != 0 + if memory_type == 0: # unregistered host pointer out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 - out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -469,12 +473,10 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id - out.is_managed = is_managed elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id - out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py new file mode 100644 index 00000000000..0e89cb92e37 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current") +LocationKind = Literal["device", "host", "host_numa", "host_numa_current"] + + +@dataclass(frozen=True) +class Location: + """Typed managed-memory location. + + Use the classmethod constructors (``device``, ``host``, ``host_numa``, + ``host_numa_current``) rather than constructing directly. 
+ """ + + kind: LocationKind + id: int | None = None + + def __post_init__(self) -> None: + if self.kind not in _VALID_KINDS: + raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}") + if self.kind == "device": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("device id must be >= 0") + elif self.kind == "host_numa": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("host_numa id must be >= 0") + elif self.kind in ("host", "host_numa_current"): + if self.id is not None: + raise ValueError(f"{self.kind} location must have id=None") + + @classmethod + def device(cls, device_id: int) -> Location: + return cls(kind="device", id=device_id) + + @classmethod + def host(cls) -> Location: + return cls(kind="host", id=None) + + @classmethod + def host_numa(cls, numa_id: int) -> Location: + return cls(kind="host_numa", id=numa_id) + + @classmethod + def host_numa_current(cls) -> Location: + return cls(kind="host_numa_current", id=None) + + +def _coerce_location(value, *, allow_none: bool = False) -> Location | None: + """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``. + + Maps int ``-1`` to host and other non-negative ints to that device ordinal. 
+ """ + from cuda.core._device import Device # avoid import cycle at module load + + if isinstance(value, Location): + return value + if isinstance(value, Device): + return Location.device(value.device_id) + if value is None: + if allow_none: + return None + raise ValueError("location is required") + if isinstance(value, int): + if value == -1: + return Location.host() + if value >= 0: + return Location.device(value) + raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}") + raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx new file mode 100644 index 00000000000..9926cbe67f8 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -0,0 +1,492 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +IF CUDA_CORE_BUILD_MAJOR >= 13: + from cpython.mem cimport PyMem_Free, PyMem_Malloc + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer +from cuda.core._resource_handles cimport as_cu +from cuda.core._stream cimport Stream, Stream_accept +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +from cuda.core._utils.cuda_utils import driver +from cuda.core._memory._managed_location import _coerce_location + + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + 
"unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current")) +cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa")) +cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host")) + +cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { + "set_read_mostly": _DEVICE_HOST_NUMA, + "unset_read_mostly": _DEVICE_HOST_NUMA, + "set_preferred_location": _ALL_LOCATION_TYPES, + "unset_preferred_location": _DEVICE_HOST_NUMA, + "set_accessed_by": _DEVICE_HOST_ONLY, + "unset_accessed_by": _DEVICE_HOST_ONLY, +} + +# Lazily cached: maps driver.CUmem_advise enum value → string alias. +cdef dict _ADVICE_ENUM_TO_ALIAS = None + + +cdef tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + global _ADVICE_ENUM_TO_ALIAS + if _ADVICE_ENUM_TO_ALIAS is None: + _ADVICE_ENUM_TO_ALIAS = {} + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + enum_val = getattr(driver.CUmem_advise, attr_name, None) + if enum_val is not None: + _ADVICE_ENUM_TO_ALIAS[enum_val] = alias + alias = _ADVICE_ENUM_TO_ALIAS.get(advice) + if alias is None: + raise ValueError(f"Unsupported advice value: {advice!r}") + return alias, advice + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef void _require_managed_buffer(Buffer self, str what): + # Buffer.is_managed handles both pointer-attribute and memory-resource + # paths (e.g. pool-allocated managed memory whose pointer attribute + # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED). 
+ if not self.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + + +cdef tuple _coerce_buffer_targets(object targets, str what): + cdef list out + if isinstance(targets, Buffer): + return (targets,) + if isinstance(targets, (list, tuple)): + if not targets: + raise ValueError(f"{what}: empty targets sequence") + out = [] + for t in targets: + if not isinstance(t, Buffer): + raise TypeError( + f"{what}: each target must be a Buffer, got {type(t).__name__}" + ) + out.append(t) + return tuple(out) + raise TypeError( + f"{what}: targets must be a Buffer or sequence of Buffer, " + f"got {type(targets).__name__}" + ) + + +cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): + cdef object coerced + if isinstance(location, (list, tuple)): + if len(location) != n: + raise ValueError( + f"{what}: location length {len(location)} does not match " + f"targets length {n}" + ) + return tuple(_coerce_location(loc, allow_none=allow_none) for loc in location) + coerced = _coerce_location(location, allow_none=allow_none) + return tuple([coerced] * n) + + +IF CUDA_CORE_BUILD_MAJOR >= 13: + # Convert a Location dataclass to a cydriver.CUmemLocation struct. + cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc): + cdef cydriver.CUmemLocation out + cdef str kind = loc.kind + if kind == "device": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + out.id = loc.id + elif kind == "host": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + out.id = 0 + elif kind == "host_numa": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + out.id = loc.id + else: # host_numa_current + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT + out.id = 0 + return out +ELSE: + # CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host). + cdef inline int _to_legacy_device(object loc) except? 
-2: + cdef str kind = loc.kind + if kind == "device": + return loc.id + if kind == "host": + return -1 + raise RuntimeError( + f"location_type={kind!r} requires a CUDA 13 build of cuda.core" + ) + + +def discard( + targets, + *, + options=None, + stream, +): + """Discard one or more managed-memory ranges. + + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard. Their resident pages + are released without prefetching new contents; subsequent access + is satisfied by lazy migration. + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous discard (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+. + """ + if options is not None: + raise TypeError( + f"discard options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard") + cdef Stream s = Stream_accept(stream) + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard") + + _do_batch_discard(bufs, s) + + +cdef void _do_batch_discard(tuple bufs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes): + PyMem_Free(ptrs) + PyMem_Free(sizes) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync( + ptrs, sizes, n, 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + ELSE: + raise NotImplementedError( + "discard requires a CUDA 13 build of cuda.core" + ) + + +def 
advise( + targets, + advice, + location=None, + *, + options=None, +): + """Apply managed-memory advice to one or more allocation ranges. + + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to advise. + advice : str | :obj:`~driver.CUmem_advise` + Managed-memory advice. String aliases (``"set_read_mostly"``, + ``"unset_read_mostly"``, ``"set_preferred_location"``, + ``"unset_preferred_location"``, ``"set_accessed_by"``, + ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). Required for advice values that consult a + location; ignored (may be ``None``) for ``set_read_mostly``, + ``unset_read_mostly``, and ``unset_preferred_location``. A sequence + must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. + """ + if options is not None: + raise TypeError( + f"advise options must be None (reserved); got {type(options).__name__}" + ) + cdef str advice_name + cdef object advice_value + advice_name, advice_value = _normalize_managed_advice(advice) + cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION + cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name] + + cdef tuple bufs = _coerce_buffer_targets(targets, "advise") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise") + + cdef Buffer buf + cdef object loc + for buf in bufs: + _require_managed_buffer(buf, "advise") + for loc in locs: + if loc is not None and loc.kind not in allowed_kinds: + raise ValueError( + f"advise '{advice_name}' does not support location_type='{loc.kind}'" + ) + + cdef Py_ssize_t i + for i in range(n): + _do_single_advise(bufs[i], advice_value, locs[i], allow_none) + + +cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none): + cdef 
cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUmem_advise advice_enum = (int(advice_value)) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef cydriver.CUmemLocation cu_loc + if loc is None: + # Driver ignores location for read_mostly / unset_preferred_location + # advice values but still validates the CUmemLocation; pass a + # host placeholder. + cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + cu_loc.id = 0 + else: + cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, cu_loc)) + ELSE: + cdef int dev_int = -1 if loc is None else _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int)) + + +def prefetch( + targets, + location=None, + *, + options=None, + stream, +): + """Prefetch one or more managed-memory ranges to a target location. + + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to operate on. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; a + sequence must match ``len(targets)``. ``Device`` and ``int`` values + are coerced to :class:`Location` (``-1`` maps to host). + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous prefetch (keyword-only). + + Raises + ------ + NotImplementedError + If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``. 
+ """ + if options is not None: + raise TypeError( + f"prefetch options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "prefetch") + cdef Stream s = Stream_accept(stream) + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "prefetch") + + if n == 1: + _do_single_prefetch(bufs[0], locs[0], s) + else: + _do_batch_prefetch(bufs, locs, s) + + +cdef void _do_single_prefetch(Buffer buf, object loc, Stream s): + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef cydriver.CUmemLocation cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, cu_loc, 0, hstream)) + ELSE: + cdef int dev_int = _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream)) + + +cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( + ptrs, 
sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "batched prefetch requires a CUDA 13 build of cuda.core" + ) + + +def discard_prefetch( + targets, + location=None, + *, + options=None, + stream, +): + """Discard one or more managed-memory ranges and prefetch them to a target location. + + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard and re-prefetch. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; + a sequence must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous operation (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch + requires CUDA 13+. 
+ """ + if options is not None: + raise TypeError( + f"discard_prefetch options must be None (reserved); " + f"got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch") + cdef Stream s = Stream_accept(stream) + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard_prefetch") + + _do_batch_discard_prefetch(bufs, locs, s) + + +cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( + ptrs, sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "discard_prefetch requires a CUDA 13 build of cuda.core" + ) diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index f15d9242778..3d4b3e4c596 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -1,7 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 +from cuda.core._memory._managed_location import Location # noqa: F401 +from cuda.core._memory._managed_memory_ops import ( + advise, # noqa: F401 + discard, # noqa: F401 + discard_prefetch, # noqa: F401 + prefetch, # noqa: F401 +) from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 88780732d54..fa17624fa5e 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -247,7 +247,12 @@ Utility functions :toctree: generated/ args_viewable_as_strided_memory + advise + prefetch + discard + discard_prefetch :template: autosummary/cyclass.rst + Location StridedMemoryView diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 34eff571005..17696b616a1 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -16,7 +16,17 @@ Highlights New features ------------ -- TBD +- Added managed-memory range operations to :mod:`cuda.core.utils`: + :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`, + :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each + operation accepts either a single managed :class:`Buffer` or a + sequence; on a CUDA 13 build of ``cuda.core``, ``prefetch`` with N>1 targets and every ``discard`` / ``discard_prefetch`` call dispatch to the + corresponding ``cuMem*BatchAsync`` driver entry point (``advise`` issues one ``cuMemAdvise`` call per buffer; on CUDA 12 builds only ``advise`` and single-buffer ``prefetch`` are available), addressing the + managed-memory portion of #1333. Locations are expressed via the typed + :class:`~utils.Location` dataclass (with classmethod constructors + ``device``, ``host``, ``host_numa``, and ``host_numa_current``); + ``Device`` and ``int`` values are still accepted for ergonomic + compatibility. 
Fixes and enhancements diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 57de22bb9a0..18f7bed1141 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,6 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, + utils, ) from cuda.core import ( system as ccx_system, @@ -48,6 +49,11 @@ from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 class DummyDeviceMemoryResource(MemoryResource): @@ -1192,6 +1198,277 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +def _get_mem_range_attr(buffer, attribute, data_size): + # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. + return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + 
visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == _HOST_LOCATION_ID + + utils.prefetch(buffer, device, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): + from cuda.core.utils import Location + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + utils.advise(buffer, "set_read_mostly") + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + # cuda.bindings currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. 
+ utils.advise(buffer, "set_preferred_location", Location.host()) + preferred_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + ) + assert preferred_location == _HOST_LOCATION_ID + + buffer.close() + + +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + utils.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + utils.discard_prefetch(buffer, 
device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.discard_prefetch(buffer, device, stream=stream) + + buffer.close() + + +def test_managed_memory_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="location is required"): + utils.prefetch(buffer, stream=stream) + from cuda.core.utils import Location + + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) + + buffer.close() + + +def test_managed_memory_advise_location_validation(init_cuda): + """Verify doc-specified location constraints for each advice kind.""" + from cuda.core.utils import Location + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + utils.advise(buffer, "set_read_mostly") + + # set_preferred_location 
requires a location; device ordinal works + utils.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location + utils.advise(buffer, "set_preferred_location", Location.host()) + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) + + # Inferred location from int: -1 maps to host, 0 maps to device + utils.advise(buffer, "set_preferred_location", -1) + utils.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + utils.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + utils.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + utils.advise(buffer, 
42) + + buffer.close() + + def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch @@ -1594,3 +1871,411 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): assert buffer.handle >= 0 assert buffer.size == 0 assert buffer.device_id == mr.device_id + + +class TestLocation: + def test_device_constructor(self): + from cuda.core.utils import Location + + loc = Location.device(0) + assert loc.kind == "device" + assert loc.id == 0 + + def test_host_constructor(self): + from cuda.core.utils import Location + + loc = Location.host() + assert loc.kind == "host" + assert loc.id is None + + def test_host_numa_constructor(self): + from cuda.core.utils import Location + + loc = Location.host_numa(3) + assert loc.kind == "host_numa" + assert loc.id == 3 + + def test_host_numa_current_constructor(self): + from cuda.core.utils import Location + + loc = Location.host_numa_current() + assert loc.kind == "host_numa_current" + assert loc.id is None + + def test_frozen(self): + import dataclasses + + from cuda.core.utils import Location + + loc = Location.device(0) + with pytest.raises(dataclasses.FrozenInstanceError): + loc.id = 1 + + def test_invalid_device_id(self): + from cuda.core.utils import Location + + with pytest.raises(ValueError, match="device id must be >= 0"): + Location.device(-1) + + def test_invalid_kind(self): + from cuda.core.utils import Location + + with pytest.raises(ValueError, match="kind must be one of"): + Location(kind="not_a_kind", id=None) + + +class TestLocationCoerce: + def test_passthrough(self): + from cuda.core._memory._managed_location import _coerce_location + from cuda.core.utils import Location + + loc = Location.device(0) + assert _coerce_location(loc) is loc + + def test_int_device(self): + from cuda.core._memory._managed_location import _coerce_location + + assert 
_coerce_location(0).kind == "device" + assert _coerce_location(0).id == 0 + + def test_int_minus_one_is_host(self): + from cuda.core._memory._managed_location import _coerce_location + + assert _coerce_location(-1).kind == "host" + + def test_device_object(self, init_cuda): + from cuda.core import Device + from cuda.core._memory._managed_location import _coerce_location + + dev = Device() + loc = _coerce_location(dev) + assert loc.kind == "device" + assert loc.id == dev.device_id + + def test_none_when_disallowed(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(ValueError, match="location is required"): + _coerce_location(None, allow_none=False) + + def test_none_when_allowed(self): + from cuda.core._memory._managed_location import _coerce_location + + assert _coerce_location(None, allow_none=True) is None + + def test_bad_int(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(ValueError, match="device ordinal"): + _coerce_location(-2) + + def test_bad_type(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(TypeError, match="Location, Device, int, or None"): + _coerce_location("device") + + +class TestPrefetch: + def test_single_with_location_host(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == _HOST_LOCATION_ID + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + 
skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream) + stream.sync() + + last0 = _get_int_mem_range_attr( + bufs[0], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + last1 = _get_int_mem_range_attr( + bufs[1], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last0 == _HOST_LOCATION_ID + assert last1 == device.device_id + for buf in bufs: + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + with pytest.raises(ValueError, match="length"): + prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + 
buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + prefetch(buf, Location.host(), stream=stream) + buf.close() + + def test_location_required(self, init_cuda): + from cuda.core.utils import prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="location is required"): + prefetch(buf, None, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + prefetch(buf, Location.host(), options={}, stream=stream) + buf.close() + + +class TestDiscard: + def test_single_buffer(self, init_cuda): + from cuda.core.utils import Location, discard, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + discard(buf, stream=stream) + stream.sync() + buf.close() + + def test_batched(self, init_cuda): + from cuda.core.utils import Location, discard, prefetch + + device = Device() + 
skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + discard(bufs, stream=stream) + stream.sync() + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import discard + + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard(buf, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import discard + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + discard(buf, options={}, stream=stream) + buf.close() + + +class TestDiscardPrefetch: + def test_single_buffer(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + discard_prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + + last = _get_int_mem_range_attr( + buf, + 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + prefetch(bufs, Location.host(), stream=stream) + stream.sync() + discard_prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + with pytest.raises(ValueError, match="length"): + discard_prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch + + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard_prefetch(buf, Location.host(), stream=stream) + buf.close() + + +class TestAdvise: + def test_batched_same_advice(self, init_cuda): + from cuda.core.utils import advise + + device = Device() + 
_skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + advise(bufs, "set_read_mostly") + for buf in bufs: + assert ( + _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.utils import Location, advise + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + advise( + bufs, + "set_preferred_location", + [Location.host(), Location.device(device.device_id)], + ) + for buf in bufs: + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import advise + + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + with pytest.raises(TypeError, match="must be None"): + advise(buf, "set_read_mostly", options={}) + buf.close()