diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index 04b5707e18e..9065da77eb8 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -4,6 +4,7 @@
 
 from libc.stdint cimport uintptr_t
 
+from cuda.bindings cimport cydriver
 from cuda.core._resource_handles cimport DevicePtrHandle
 from cuda.core._stream cimport Stream
 
@@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle(
     MemoryResource mr,
     object ipc_descriptor = *
 )
+
+# Memory attribute query helpers (used by _managed_memory_ops)
+cdef void _init_mem_attrs(Buffer self)
+cdef int _query_memory_attrs(
+    _MemAttrs& out,
+    cydriver.CUdeviceptr ptr,
+) except -1 nogil
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index bb6fd97df6f..4ca8650e8db 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -71,6 +71,7 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
 :attr:`Buffer.handle`.
 """
 
+
 cdef class Buffer:
     """Represent a handle to allocated memory.
 
@@ -455,12 +456,15 @@ cdef inline int _query_memory_attrs(
         ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
     HANDLE_RETURN(ret)
 
+    # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the
+    # CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet.
+    out.is_managed = is_managed != 0
+
     if memory_type == 0:
         # unregistered host pointer
         out.is_host_accessible = True
         out.is_device_accessible = False
         out.device_id = -1
-        out.is_managed = False
     elif (
         is_managed
         or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
@@ -469,12 +473,10 @@ cdef inline int _query_memory_attrs(
         out.is_host_accessible = True
         out.is_device_accessible = True
         out.device_id = device_id
-        out.is_managed = is_managed
     elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
         out.is_host_accessible = False
         out.is_device_accessible = True
         out.device_id = device_id
-        out.is_managed = False
     else:
         with cython.gil:
             raise ValueError(f"Unsupported memory type: {memory_type}")
diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
new file mode 100644
index 00000000000..0e89cb92e37
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current")
+LocationKind = Literal["device", "host", "host_numa", "host_numa_current"]
+
+
+@dataclass(frozen=True)
+class Location:
+    """Typed managed-memory location.
+
+    Use the classmethod constructors (``device``, ``host``, ``host_numa``,
+    ``host_numa_current``) rather than constructing directly.
+    """
+
+    kind: LocationKind
+    id: int | None = None
+
+    def __post_init__(self) -> None:
+        if self.kind not in _VALID_KINDS:
+            raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}")
+        if self.kind == "device":
+            if not isinstance(self.id, int) or self.id < 0:
+                raise ValueError("device id must be >= 0")
+        elif self.kind == "host_numa":
+            if not isinstance(self.id, int) or self.id < 0:
+                raise ValueError("host_numa id must be >= 0")
+        elif self.kind in ("host", "host_numa_current"):
+            if self.id is not None:
+                raise ValueError(f"{self.kind} location must have id=None")
+
+    @classmethod
+    def device(cls, device_id: int) -> Location:
+        return cls(kind="device", id=device_id)
+
+    @classmethod
+    def host(cls) -> Location:
+        return cls(kind="host", id=None)
+
+    @classmethod
+    def host_numa(cls, numa_id: int) -> Location:
+        return cls(kind="host_numa", id=numa_id)
+
+    @classmethod
+    def host_numa_current(cls) -> Location:
+        return cls(kind="host_numa_current", id=None)
+
+
+def _coerce_location(value, *, allow_none: bool = False) -> Location | None:
+    """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``.
+
+    Maps int ``-1`` to host and other non-negative ints to that device ordinal.
+    """
+    from cuda.core._device import Device  # avoid import cycle at module load
+
+    if isinstance(value, Location):
+        return value
+    if isinstance(value, Device):
+        return Location.device(value.device_id)
+    if value is None:
+        if allow_none:
+            return None
+        raise ValueError("location is required")
+    if isinstance(value, int):
+        if value == -1:
+            return Location.host()
+        if value >= 0:
+            return Location.device(value)
+        raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}")
+    raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}")
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
new file mode 100644
index 00000000000..9926cbe67f8
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -0,0 +1,492 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+IF CUDA_CORE_BUILD_MAJOR >= 13:
+    from cpython.mem cimport PyMem_Free, PyMem_Malloc
+
+from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._resource_handles cimport as_cu
+from cuda.core._stream cimport Stream, Stream_accept
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+from cuda.core._utils.cuda_utils import driver
+from cuda.core._memory._managed_location import _coerce_location
+
+
+cdef dict _MANAGED_ADVICE_ALIASES = {  # string alias -> driver.CUmem_advise member name
+    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
+    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
+    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
+    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
+    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
+    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
+}
+
+cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((  # aliases for which advise() accepts location=None
+    "set_read_mostly",
+    "unset_read_mostly",
+    "unset_preferred_location",
+))
+
+cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
+cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
+cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
+
+cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {  # alias -> Location kinds advise() accepts for that advice
+    "set_read_mostly": _DEVICE_HOST_NUMA,
+    "unset_read_mostly": _DEVICE_HOST_NUMA,
+    "set_preferred_location": _ALL_LOCATION_TYPES,
+    "unset_preferred_location": _DEVICE_HOST_NUMA,
+    "set_accessed_by": _DEVICE_HOST_ONLY,
+    "unset_accessed_by": _DEVICE_HOST_ONLY,
+}
+
+# Lazily cached (filled by _normalize_managed_advice): maps driver.CUmem_advise enum value → string alias.
+cdef dict _ADVICE_ENUM_TO_ALIAS = None
+
+
+cdef tuple _normalize_managed_advice(object advice):  # returns (alias, driver.CUmem_advise value)
+    cdef str alias
+    cdef str attr_name
+    if isinstance(advice, str):
+        alias = advice.lower()  # string aliases are matched case-insensitively
+        attr_name = _MANAGED_ADVICE_ALIASES.get(alias)
+        if attr_name is None:
+            raise ValueError(
+                "advice must be one of "
+                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
+            )
+        return alias, getattr(driver.CUmem_advise, attr_name)
+
+    if isinstance(advice, driver.CUmem_advise):
+        global _ADVICE_ENUM_TO_ALIAS
+        if _ADVICE_ENUM_TO_ALIAS is None:  # build the enum -> alias map once, on first use
+            _ADVICE_ENUM_TO_ALIAS = {}
+            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+                enum_val = getattr(driver.CUmem_advise, attr_name, None)
+                if enum_val is not None:  # members absent from driver.CUmem_advise are skipped
+                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
+        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
+        if alias is None:
+            raise ValueError(f"Unsupported advice value: {advice!r}")
+        return alias, advice
+
+    raise TypeError(
+        "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
+    )
+
+
+cdef void _require_managed_buffer(Buffer self, str what):  # ``what`` names the operation in the error
+    # Buffer.is_managed handles both pointer-attribute and memory-resource
+    # paths (e.g. pool-allocated managed memory whose pointer attribute
+    # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED).
+    if not self.is_managed:
+        raise ValueError(f"{what} requires a managed-memory allocation")
+
+
+cdef tuple _coerce_buffer_targets(object targets, str what):
+    # Normalize ``targets`` (one Buffer, or a non-empty list/tuple of Buffers)
+    # into a tuple of Buffers; ``what`` prefixes the error messages.
+    if isinstance(targets, Buffer):
+        return (<Buffer>targets,)
+    if not isinstance(targets, (list, tuple)):
+        raise TypeError(
+            f"{what}: targets must be a Buffer or sequence of Buffer, "
+            f"got {type(targets).__name__}"
+        )
+    if not targets:
+        raise ValueError(f"{what}: empty targets sequence")
+    # Every element is checked before the result tuple is built.
+    for item in targets:
+        if not isinstance(item, Buffer):
+            raise TypeError(
+                f"{what}: each target must be a Buffer, got {type(item).__name__}"
+            )
+    return tuple(targets)
+
+
+cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what):
+    # A list/tuple supplies one location per target and must have length n;
+    # any other value is coerced once and repeated n times.
+    if not isinstance(location, (list, tuple)):
+        return (_coerce_location(location, allow_none=allow_none),) * n
+    if len(location) != n:
+        raise ValueError(
+            f"{what}: location length {len(location)} does not match "
+            f"targets length {n}"
+        )
+    return tuple([_coerce_location(loc, allow_none=allow_none) for loc in location])
+
+
+IF CUDA_CORE_BUILD_MAJOR >= 13:
+    # Convert a Location dataclass to a cydriver.CUmemLocation struct.
+    cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc):
+        cdef cydriver.CUmemLocation out
+        cdef str kind = loc.kind
+        if kind == "device":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            out.id = <int>loc.id
+        elif kind == "host":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+            out.id = 0  # "host" carries no id on the Location; 0 is written as filler
+        elif kind == "host_numa":
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
+            out.id = <int>loc.id
+        else:  # host_numa_current
+            out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT
+            out.id = 0
+        return out
+ELSE:
+    # CUDA 12 cuMemPrefetchAsync / cuMemAdvise take a device ordinal (-1 = host).
+    cdef inline int _to_legacy_device(object loc) except? -2:  # -2 flags an error; -1 is a valid result
+        cdef str kind = loc.kind
+        if kind == "device":
+            return <int>loc.id
+        if kind == "host":
+            return -1
+        raise RuntimeError(  # any other kind has no ordinal form here
+            f"location_type={kind!r} requires a CUDA 13 build of cuda.core"
+        )
+
+
+def discard(
+    targets,
+    *,
+    options=None,
+    stream,
+):
+    """Discard one or more managed-memory ranges.
+
+    Parameters
+    ----------
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to discard. Their resident pages
+        are released without prefetching new contents; subsequent access
+        is satisfied by lazy migration.
+    options : None
+        Reserved for future per-call flags; anything else raises ``TypeError``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous discard (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+.
+    """
+    if options is not None:
+        raise TypeError(
+            f"discard options must be None (reserved); got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "discard")
+    cdef Stream s = Stream_accept(stream)
+
+    cdef Buffer buf
+    for buf in bufs:  # every target is checked before any driver call is made
+        _require_managed_buffer(buf, "discard")
+
+    _do_batch_discard(bufs, s)  # the batch path is used even for a single buffer
+
+
+cdef void _do_batch_discard(tuple bufs, Stream s):  # one cuMemDiscardBatchAsync call for all of ``bufs`` on ``s``
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes):  # an allocation failed; release whichever one succeeded
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):  # fill the parallel pointer / size arrays
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync(
+                    ptrs, sizes, <size_t>n, 0, hstream,
+                ))
+        finally:  # the scratch arrays are freed whether or not the call succeeded
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+    ELSE:
+        raise NotImplementedError(
+            "discard requires a CUDA 13 build of cuda.core"
+        )
+
+
+def advise(
+    targets,
+    advice,
+    location=None,
+    *,
+    options=None,
+):
+    """Apply managed-memory advice to one or more allocation ranges.
+
+    Parameters
+    ----------
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to advise.
+    advice : str | :obj:`~driver.CUmem_advise`
+        Managed-memory advice. String aliases (``"set_read_mostly"``,
+        ``"unset_read_mostly"``, ``"set_preferred_location"``,
+        ``"unset_preferred_location"``, ``"set_accessed_by"``,
+        ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted.
+    location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...]
+        Target location(s). Required for advice values that consult a
+        location; optional (may be ``None``) for ``set_read_mostly``,
+        ``unset_read_mostly``, and ``unset_preferred_location``. A sequence
+        must match ``len(targets)``; a single value applies to every target.
+    options : None
+        Reserved for future per-call flags; anything else raises ``TypeError``.
+    """
+    if options is not None:
+        raise TypeError(
+            f"advise options must be None (reserved); got {type(options).__name__}"
+        )
+    cdef str advice_name
+    cdef object advice_value
+    advice_name, advice_value = _normalize_managed_advice(advice)
+    cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION
+    cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name]
+
+    cdef tuple bufs = _coerce_buffer_targets(targets, "advise")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise")
+
+    cdef Buffer buf
+    cdef object loc
+    for buf in bufs:  # every buffer and location is validated before any advice is applied
+        _require_managed_buffer(buf, "advise")
+    for loc in locs:
+        if loc is not None and loc.kind not in allowed_kinds:  # a supplied location is checked even when optional
+            raise ValueError(
+                f"advise '{advice_name}' does not support location_type='{loc.kind}'"
+            )
+
+    cdef Py_ssize_t i
+    for i in range(n):  # one cuMemAdvise call per target
+        _do_single_advise(<Buffer>bufs[i], advice_value, locs[i], allow_none)
+
+
+cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none):
+    cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)  # NOTE(review): allow_none is not read in this body
+    cdef size_t nbytes = buf._size
+    cdef cydriver.CUmem_advise advice_enum = <cydriver.CUmem_advise>(<int>int(advice_value))
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef cydriver.CUmemLocation cu_loc
+        if loc is None:
+            # Driver ignores location for read_mostly / unset_preferred_location
+            # advice values but still validates the CUmemLocation; pass a
+            # host placeholder.
+            cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+            cu_loc.id = 0
+        else:
+            cu_loc = _to_cumemlocation(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, cu_loc))
+    ELSE:
+        cdef int dev_int = -1 if loc is None else _to_legacy_device(loc)  # no location -> -1 (host ordinal)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int))
+
+
+def prefetch(
+    targets,
+    location=None,
+    *,
+    options=None,
+    stream,
+):
+    """Prefetch one or more managed-memory ranges to a target location.
+
+    Parameters
+    ----------
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to operate on.
+    location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...]
+        Target location(s); required (``None`` raises ``ValueError``). A
+        single location applies to all targets; a sequence must match
+        ``len(targets)``. ``Device`` and ``int`` values are coerced to
+        :class:`Location` (``-1`` maps to host).
+    options : None
+        Reserved for future per-call flags; anything else raises ``TypeError``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous prefetch (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``.
+    """
+    if options is not None:
+        raise TypeError(
+            f"prefetch options must be None (reserved); got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, False, "prefetch")  # allow_none=False: location is mandatory
+    cdef Stream s = Stream_accept(stream)
+
+    cdef Buffer buf
+    for buf in bufs:  # every target is checked before any driver call is made
+        _require_managed_buffer(buf, "prefetch")
+
+    if n == 1:  # a single target uses cuMemPrefetchAsync, which also exists on CUDA 12 builds
+        _do_single_prefetch(<Buffer>bufs[0], locs[0], s)
+    else:
+        _do_batch_prefetch(bufs, locs, s)
+
+
+cdef void _do_single_prefetch(Buffer buf, object loc, Stream s):  # cuMemPrefetchAsync over the whole of ``buf``
+    cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr)
+    cdef size_t nbytes = buf._size
+    cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef cydriver.CUmemLocation cu_loc = _to_cumemlocation(loc)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, cu_loc, 0, hstream))
+    ELSE:
+        cdef int dev_int = _to_legacy_device(loc)  # raises RuntimeError for kinds other than device / host
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream))
+
+
+cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s):  # one cuMemPrefetchBatchAsync call for all of ``bufs``
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc(
+            n * sizeof(cydriver.CUmemLocation)
+        )
+        cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes and loc_arr and loc_indices):  # an allocation failed; release the rest
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):  # fill the parallel pointer / size / location arrays
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+                loc_arr[i] = _to_cumemlocation(locs[i])
+                loc_indices[i] = <size_t>i  # identity index; NOTE(review): confirm pairing rule in driver docs
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync(
+                    ptrs, sizes, <size_t>n,
+                    loc_arr, loc_indices, <size_t>n,
+                    0, hstream,
+                ))
+        finally:  # the scratch arrays are freed whether or not the call succeeded
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+    ELSE:
+        raise NotImplementedError(
+            "batched prefetch requires a CUDA 13 build of cuda.core"
+        )
+
+
+def discard_prefetch(
+    targets,
+    location=None,
+    *,
+    options=None,
+    stream,
+):
+    """Discard one or more managed-memory ranges and prefetch them to a target location.
+
+    Parameters
+    ----------
+    targets : :class:`Buffer` | Sequence[:class:`Buffer`]
+        One or more managed allocations to discard and re-prefetch.
+    location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...]
+        Target location(s); required. A single location applies to all
+        targets; a sequence must match ``len(targets)``.
+    options : None
+        Reserved for future per-call flags; anything else raises ``TypeError``.
+    stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder`
+        Stream for the asynchronous operation (keyword-only).
+
+    Raises
+    ------
+    NotImplementedError
+        On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch
+        requires CUDA 13+.
+    """
+    if options is not None:
+        raise TypeError(
+            f"discard_prefetch options must be None (reserved); "
+            f"got {type(options).__name__}"
+        )
+    cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch")
+    cdef Py_ssize_t n = len(bufs)
+    cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch")  # allow_none=False
+    cdef Stream s = Stream_accept(stream)
+
+    cdef Buffer buf
+    for buf in bufs:  # every target is checked before any driver call is made
+        _require_managed_buffer(buf, "discard_prefetch")
+
+    _do_batch_discard_prefetch(bufs, locs, s)  # the batch path is used even for a single buffer
+
+
+cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s):  # one cuMemDiscardAndPrefetchBatchAsync call
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        cdef Py_ssize_t n = len(bufs)
+        cdef cydriver.CUstream hstream = as_cu(s._h_stream)
+        cdef cydriver.CUdeviceptr* ptrs = <cydriver.CUdeviceptr*>PyMem_Malloc(
+            n * sizeof(cydriver.CUdeviceptr)
+        )
+        cdef size_t* sizes = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        cdef cydriver.CUmemLocation* loc_arr = <cydriver.CUmemLocation*>PyMem_Malloc(
+            n * sizeof(cydriver.CUmemLocation)
+        )
+        cdef size_t* loc_indices = <size_t*>PyMem_Malloc(n * sizeof(size_t))
+        if not (ptrs and sizes and loc_arr and loc_indices):  # an allocation failed; release the rest
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+            raise MemoryError()
+        cdef Buffer buf
+        cdef Py_ssize_t i
+        try:
+            for i in range(n):  # fill the parallel pointer / size / location arrays
+                buf = <Buffer>bufs[i]
+                ptrs[i] = as_cu(buf._h_ptr)
+                sizes[i] = buf._size
+                loc_arr[i] = _to_cumemlocation(locs[i])
+                loc_indices[i] = <size_t>i  # identity index; NOTE(review): confirm pairing rule in driver docs
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync(
+                    ptrs, sizes, <size_t>n,
+                    loc_arr, loc_indices, <size_t>n,
+                    0, hstream,
+                ))
+        finally:  # the scratch arrays are freed whether or not the call succeeded
+            PyMem_Free(ptrs)
+            PyMem_Free(sizes)
+            PyMem_Free(loc_arr)
+            PyMem_Free(loc_indices)
+    ELSE:
+        raise NotImplementedError(
+            "discard_prefetch requires a CUDA 13 build of cuda.core"
+        )
diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py
index f15d9242778..3d4b3e4c596 100644
--- a/cuda_core/cuda/core/utils.py
+++ b/cuda_core/cuda/core/utils.py
@@ -1,7 +1,14 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.core._memory._managed_location import Location  # noqa: F401
+from cuda.core._memory._managed_memory_ops import (
+    advise,  # noqa: F401
+    discard,  # noqa: F401
+    discard_prefetch,  # noqa: F401
+    prefetch,  # noqa: F401
+)
 from cuda.core._memoryview import (
     StridedMemoryView,  # noqa: F401
     args_viewable_as_strided_memory,  # noqa: F401
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 88780732d54..fa17624fa5e 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -247,7 +247,12 @@ Utility functions
    :toctree: generated/
 
    args_viewable_as_strided_memory
+   advise
+   prefetch
+   discard
+   discard_prefetch
 
    :template: autosummary/cyclass.rst
 
+   Location
    StridedMemoryView
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 34eff571005..17696b616a1 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -16,7 +16,17 @@ Highlights
 New features
 ------------
 
-- TBD
+- Added managed-memory range operations to :mod:`cuda.core.utils`:
+  :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`,
+  :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each
+  operation accepts either a single managed :class:`Buffer` or a
+  sequence; on a CUDA 13 build of ``cuda.core``, multi-buffer prefetch,
+  discard, and discard-prefetch dispatch to the corresponding
+  ``cuMem*BatchAsync`` driver entry point, addressing the
+  managed-memory portion of #1333. Locations are expressed via the typed
+  :class:`~utils.Location` dataclass (with classmethod constructors
+  ``device``, ``host``, ``host_numa``, and ``host_numa_current``);
+  ``Device`` and ``int`` values are also accepted for convenience.
 
 
 Fixes and enhancements
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 57de22bb9a0..18f7bed1141 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -38,6 +38,7 @@
     PinnedMemoryResourceOptions,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
+    utils,
 )
 from cuda.core import (
     system as ccx_system,
@@ -48,6 +49,11 @@
 from cuda.core.utils import StridedMemoryView
 
 POOL_SIZE = 2097152  # 2MB size
+_MANAGED_TEST_ALLOCATION_SIZE = 4096
+_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4
+_READ_MOSTLY_ENABLED = 1
+_HOST_LOCATION_ID = -1
+_INVALID_HOST_DEVICE_ORDINAL = 0
 
 
 class DummyDeviceMemoryResource(MemoryResource):
@@ -1192,6 +1198,277 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+def _get_mem_range_attr(buffer, attribute, data_size):
+    # cuMemRangeGetAttribute returns a raw integer when data_size <= 4; the query spans the whole buffer.
+    return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
+
+
+def _get_int_mem_range_attr(buffer, attribute):  # range query with the data_size used for integer attributes
+    return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE)
+
+
+def _skip_if_managed_allocation_unsupported(device):  # skips the calling test when managed memory is unavailable
+    try:
+        if not device.properties.managed_memory:
+            pytest.skip("Device does not support managed memory operations")
+    except AttributeError:  # the property lookup itself failed; treated as "unsupported" rather than an error
+        pytest.skip("Managed-memory buffer operations require CUDA support")
+
+
+def _skip_if_managed_location_ops_unsupported(device):  # allocation gate plus concurrent-managed-access gate
+    _skip_if_managed_allocation_unsupported(device)
+    try:
+        if not device.properties.concurrent_managed_access:
+            pytest.skip("Device does not support concurrent managed memory access")
+    except AttributeError:  # the property lookup itself failed; treated as "unsupported" rather than an error
+        pytest.skip("Managed-memory location operations require CUDA support")
+
+
+def _skip_if_managed_discard_prefetch_unsupported(device):  # strictest gate, used by discard_prefetch tests
+    _skip_if_managed_location_ops_unsupported(device)
+    if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+        pytest.skip("discard-prefetch requires cuda.bindings support")
+    # Beyond ``device`` itself, every visible device must report concurrent managed access.
+    visible_devices = Device.get_all_devices()
+    if not all(dev.properties.concurrent_managed_access for dev in visible_devices):
+        pytest.skip("discard-prefetch requires concurrent managed access on all visible devices")
+
+
+def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+    # Allocate through the managed memory resource helper and create a stream for the prefetches.
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # Prefetch to the host, given as the bare int -1, then read back the last prefetch location.
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    stream.sync()
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == _HOST_LOCATION_ID
+    # Prefetch again, this time passing the Device object itself as the location.
+    utils.prefetch(buffer, device, stream=stream)
+    stream.sync()
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
+def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
+    from cuda.core.utils import Location
+
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+    # The buffer comes from DummyUnifiedMemoryResource (the "external" allocation of the test name).
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    # Read-mostly advice is given without a location and verified through the READ_MOSTLY range attribute.
+    utils.advise(buffer, "set_read_mostly")
+    assert (
+        _get_int_mem_range_attr(
+            buffer,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        )
+        == _READ_MOSTLY_ENABLED
+    )
+
+    # cuda.bindings currently exposes the combined location attributes for
+    # cuMemRangeGetAttribute, so use the legacy location query here.
+    utils.advise(buffer, "set_preferred_location", Location.host())
+    preferred_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
+    )
+    assert preferred_location == _HOST_LOCATION_ID
+
+    buffer.close()
+
+
+def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda):
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+    # The buffer comes from DummyUnifiedMemoryResource (the "external" allocation of the test name).
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # Prefetch to the device (passed as a Device object) and wait for the stream.
+    utils.prefetch(buffer, device, stream=stream)
+    stream.sync()
+    # The last-prefetch-location attribute must now hold the device ordinal.
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
+def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    _skip_if_managed_discard_prefetch_unsupported(device)
+    device.set_current()
+    # Allocate through the managed memory resource helper and create a stream for the operations.
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # Start from a host prefetch so the later device location is a visible change.
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    stream.sync()
+    # Discard-and-prefetch to the device, passed as a Device object.
+    utils.discard_prefetch(buffer, device, stream=stream)
+    stream.sync()
+    # The last-prefetch-location attribute must now hold the device ordinal.
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
+def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda):
+    device = Device()
+    _skip_if_managed_discard_prefetch_unsupported(device)
+    device.set_current()
+    # The buffer comes from DummyUnifiedMemoryResource (the "external" allocation of the test name).
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # Start from a host prefetch so the later device location is a visible change.
+    utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    stream.sync()
+    # Discard-and-prefetch to the device, passed as a Device object.
+    utils.discard_prefetch(buffer, device, stream=stream)
+    stream.sync()
+    # The last-prefetch-location attribute must now hold the device ordinal.
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
+def test_managed_memory_operations_reject_non_managed_allocations(init_cuda):
+    device = Device()
+    device.set_current()
+    # A buffer from DummyDeviceMemoryResource stands in for a non-managed allocation.
+    buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # advise, prefetch and discard_prefetch must each raise the same "managed-memory allocation" ValueError.
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        utils.advise(buffer, "set_read_mostly")
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        utils.prefetch(buffer, device, stream=stream)
+    with pytest.raises(ValueError, match="managed-memory allocation"):
+        utils.discard_prefetch(buffer, device, stream=stream)
+
+    buffer.close()
+
+
+def test_managed_memory_operation_validation(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    device.set_current()
+    # Allocate through the managed memory resource helper and create a stream for the calls.
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+    # prefetch() called without any location argument must be rejected.
+    with pytest.raises(ValueError, match="location is required"):
+        utils.prefetch(buffer, stream=stream)
+    from cuda.core.utils import Location
+
+    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+        utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL))
+
+    buffer.close()
+
+
+def test_managed_memory_advise_location_validation(init_cuda):
+    """Check which location arguments each advice kind accepts or rejects."""
+    from cuda.core.utils import Location
+
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+
+    # set_read_mostly works without a location (location is ignored)
+    utils.advise(buffer, "set_read_mostly")
+
+    # set_preferred_location requires a location; device ordinal works
+    utils.advise(buffer, "set_preferred_location", device.device_id)
+
+    # set_preferred_location with host location
+    utils.advise(buffer, "set_preferred_location", Location.host())
+
+    # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs)
+    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+        utils.advise(buffer, "set_accessed_by", Location.host_numa(0))
+
+    # set_accessed_by with host_numa_current also raises ValueError
+    with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"):
+        utils.advise(buffer, "set_accessed_by", Location.host_numa_current())
+
+    # Bare ints are accepted as locations: -1 is taken as the host, 0 as device ordinal 0
+    utils.advise(buffer, "set_preferred_location", -1)
+    utils.advise(buffer, "set_preferred_location", 0)
+
+    buffer.close()
+
+
+def test_managed_memory_advise_accepts_enum_value(init_cuda):
+    """advise() accepts CUmem_advise enum values directly, not just string aliases."""
+    device = Device()
+    _skip_if_managed_location_ops_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    # Pass the driver enum member instead of the "set_read_mostly" string used by other tests.
+    advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY
+    utils.advise(buffer, advice_enum)
+    # The effect is checked through the READ_MOSTLY range attribute.
+    assert (
+        _get_int_mem_range_attr(
+            buffer,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        )
+        == _READ_MOSTLY_ENABLED
+    )
+
+    buffer.close()
+
+
+def test_managed_memory_advise_invalid_advice_values(init_cuda):
+    """advise() rejects invalid advice strings and wrong types."""
+    device = Device()
+    _skip_if_managed_allocation_unsupported(device)
+    device.set_current()
+
+    buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    # An unrecognized advice string is reported as a ValueError.
+    with pytest.raises(ValueError, match="advice must be one of"):
+        utils.advise(buffer, "not_a_real_advice")
+    # A plain int (not a string, not the enum) is reported as a TypeError.
+    with pytest.raises(TypeError, match="advice must be"):
+        utils.advise(buffer, 42)
+
+    buffer.close()
+
+
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch
@@ -1594,3 +1871,411 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
     assert buffer.handle >= 0
     assert buffer.size == 0
     assert buffer.device_id == mr.device_id
+
+
+class TestLocation:  # constructors and validation of cuda.core.utils.Location; no CUDA fixture is requested
+    def test_device_constructor(self):
+        from cuda.core.utils import Location
+
+        loc = Location.device(0)
+        assert loc.kind == "device"
+        assert loc.id == 0
+
+    def test_host_constructor(self):
+        from cuda.core.utils import Location
+
+        loc = Location.host()
+        assert loc.kind == "host"
+        assert loc.id is None  # a plain host location carries no id
+
+    def test_host_numa_constructor(self):
+        from cuda.core.utils import Location
+
+        loc = Location.host_numa(3)
+        assert loc.kind == "host_numa"
+        assert loc.id == 3  # the argument is stored as the id
+
+    def test_host_numa_current_constructor(self):
+        from cuda.core.utils import Location
+
+        loc = Location.host_numa_current()
+        assert loc.kind == "host_numa_current"
+        assert loc.id is None  # no explicit id is stored for this kind
+
+    def test_frozen(self):
+        import dataclasses
+
+        from cuda.core.utils import Location
+
+        loc = Location.device(0)
+        with pytest.raises(dataclasses.FrozenInstanceError):
+            loc.id = 1  # attribute assignment on an existing Location must fail
+
+    def test_invalid_device_id(self):
+        from cuda.core.utils import Location
+
+        with pytest.raises(ValueError, match="device id must be >= 0"):
+            Location.device(-1)
+
+    def test_invalid_kind(self):
+        from cuda.core.utils import Location
+
+        with pytest.raises(ValueError, match="kind must be one of"):
+            Location(kind="not_a_kind", id=None)  # direct construction is validated as well
+
+
+class TestLocationCoerce:  # private _coerce_location helper: Location, int, Device and None inputs
+    def test_passthrough(self):
+        from cuda.core._memory._managed_location import _coerce_location
+        from cuda.core.utils import Location
+
+        loc = Location.device(0)
+        assert _coerce_location(loc) is loc  # identity check: the very same object comes back
+
+    def test_int_device(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        assert _coerce_location(0).kind == "device"
+        assert _coerce_location(0).id == 0
+
+    def test_int_minus_one_is_host(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        assert _coerce_location(-1).kind == "host"
+
+    def test_device_object(self, init_cuda):
+        from cuda.core import Device
+        from cuda.core._memory._managed_location import _coerce_location
+
+        dev = Device()
+        loc = _coerce_location(dev)
+        assert loc.kind == "device"
+        assert loc.id == dev.device_id  # the Device's ordinal becomes the location id
+
+    def test_none_when_disallowed(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        with pytest.raises(ValueError, match="location is required"):
+            _coerce_location(None, allow_none=False)
+
+    def test_none_when_allowed(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        assert _coerce_location(None, allow_none=True) is None
+
+    def test_bad_int(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        with pytest.raises(ValueError, match="device ordinal"):
+            _coerce_location(-2)  # -2 is neither the host value (-1) nor a non-negative ordinal
+
+    def test_bad_type(self):
+        from cuda.core._memory._managed_location import _coerce_location
+
+        with pytest.raises(TypeError, match="Location, Device, int, or None"):
+            _coerce_location("device")  # a string is not an accepted location type
+
+
+class TestPrefetch:  # utils.prefetch: single buffer, batched buffers, and argument validation
+    def test_single_with_location_host(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        # A typed Location.host() must read back as the host id (-1) from the range attribute.
+        prefetch(buf, Location.host(), stream=stream)
+        stream.sync()
+        last = _get_int_mem_range_attr(
+            buf,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last == _HOST_LOCATION_ID
+        buf.close()
+
+    def test_batched_same_location(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
+            pytest.skip("cuMemPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)]
+        stream = device.create_stream()
+        # A single Location is given for the whole list of buffers.
+        prefetch(bufs, Location.device(device.device_id), stream=stream)
+        stream.sync()
+        # Every buffer must report the device as its last prefetch location.
+        for buf in bufs:
+            last = _get_int_mem_range_attr(
+                buf,
+                driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+            )
+            assert last == device.device_id
+            buf.close()
+
+    def test_batched_per_buffer_location(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemPrefetchBatchAsync"):
+            pytest.skip("cuMemPrefetchBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        # One Location per buffer: the first goes to the host, the second to the device.
+        prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream)
+        stream.sync()
+        # Each buffer is queried separately and compared against its own target.
+        last0 = _get_int_mem_range_attr(
+            bufs[0],
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        last1 = _get_int_mem_range_attr(
+            bufs[1],
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last0 == _HOST_LOCATION_ID
+        assert last1 == device.device_id
+        for buf in bufs:
+            buf.close()
+
+    def test_length_mismatch(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        # Two buffers but only one location in the list: must be rejected.
+        with pytest.raises(ValueError, match="length"):
+            prefetch(bufs, [Location.host()], stream=stream)
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)  # non-managed stand-in
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            prefetch(buf, Location.host(), stream=stream)
+        buf.close()
+
+    def test_location_required(self, init_cuda):
+        from cuda.core.utils import prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="location is required"):
+            prefetch(buf, None, stream=stream)  # an explicit None location is rejected
+        buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.utils import Location, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(TypeError, match="must be None"):
+            prefetch(buf, Location.host(), options={}, stream=stream)  # a non-None options value (empty dict)
+        buf.close()
+
+
+class TestDiscard:  # utils.discard: single buffer, batched buffers, and argument validation
+    def test_single_buffer(self, init_cuda):
+        from cuda.core.utils import Location, discard, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardBatchAsync"):
+            pytest.skip("cuMemDiscardBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        prefetch(buf, Location.device(device.device_id), stream=stream)  # prefetch to the device first
+        stream.sync()
+        discard(buf, stream=stream)  # no attribute is queried afterwards; the test passes if nothing raises
+        stream.sync()
+        buf.close()
+
+    def test_batched(self, init_cuda):
+        from cuda.core.utils import Location, discard, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        if not hasattr(driver, "cuMemDiscardBatchAsync"):
+            pytest.skip("cuMemDiscardBatchAsync unavailable")
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)]
+        stream = device.create_stream()
+        prefetch(bufs, Location.device(device.device_id), stream=stream)  # prefetch all three to the device
+        stream.sync()
+        discard(bufs, stream=stream)  # list form; again the test passes if nothing raises
+        stream.sync()
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.utils import discard
+
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)  # non-managed stand-in
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            discard(buf, stream=stream)
+        buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.utils import discard
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        with pytest.raises(TypeError, match="must be None"):
+            discard(buf, options={}, stream=stream)  # a non-None options value (empty dict)
+        buf.close()
+
+
+class TestDiscardPrefetch:  # utils.discard_prefetch: single buffer, batched buffers, and argument validation
+    def test_single_buffer(self, init_cuda):
+        from cuda.core.utils import Location, discard_prefetch, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        # Same gate as the module-level discard_prefetch tests: entry point + concurrent access on all devices.
+        _skip_if_managed_discard_prefetch_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        stream = device.create_stream()
+        # Start on the host so the device location read back below is a visible change.
+        prefetch(buf, Location.host(), stream=stream)
+        stream.sync()
+        discard_prefetch(buf, Location.device(device.device_id), stream=stream)
+        stream.sync()
+        # The last-prefetch-location attribute must now hold the device ordinal.
+        last = _get_int_mem_range_attr(
+            buf,
+            driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+        )
+        assert last == device.device_id
+        buf.close()
+
+    def test_batched_same_location(self, init_cuda):
+        from cuda.core.utils import Location, discard_prefetch, prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        # Same gate as the module-level discard_prefetch tests: entry point + concurrent access on all devices.
+        _skip_if_managed_discard_prefetch_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        prefetch(bufs, Location.host(), stream=stream)  # both buffers start on the host
+        stream.sync()
+        discard_prefetch(bufs, Location.device(device.device_id), stream=stream)  # one Location for the list
+        stream.sync()
+        for buf in bufs:
+            last = _get_int_mem_range_attr(
+                buf,
+                driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+            )
+            assert last == device.device_id
+            buf.close()
+
+    def test_length_mismatch(self, init_cuda):
+        from cuda.core.utils import Location, discard_prefetch
+
+        device = Device()
+        skip_if_managed_memory_unsupported(device)
+        device.set_current()
+        mr = create_managed_memory_resource_or_skip()
+        bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="length"):
+            discard_prefetch(bufs, [Location.host()], stream=stream)  # two buffers, one location
+        for buf in bufs:
+            buf.close()
+
+    def test_rejects_non_managed(self, init_cuda):
+        from cuda.core.utils import Location, discard_prefetch
+
+        device = Device()
+        device.set_current()
+        buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)  # non-managed stand-in
+        stream = device.create_stream()
+        with pytest.raises(ValueError, match="managed-memory"):
+            discard_prefetch(buf, Location.host(), stream=stream)
+        buf.close()
+
+
+class TestAdvise:  # utils.advise with lists of buffers, plus options validation
+    def test_batched_same_advice(self, init_cuda):
+        from cuda.core.utils import advise
+
+        device = Device()
+        _skip_if_managed_location_ops_unsupported(device)
+        device.set_current()
+        bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        advise(bufs, "set_read_mostly")  # one advice for the whole list
+        for buf in bufs:
+            assert (
+                _get_int_mem_range_attr(
+                    buf,
+                    driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+                )
+                == _READ_MOSTLY_ENABLED
+            )
+            buf.close()
+
+    def test_batched_per_buffer_location(self, init_cuda):
+        from cuda.core.utils import Location, advise
+
+        device = Device()
+        _skip_if_managed_location_ops_unsupported(device)
+        device.set_current()
+        bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)]
+        advise(  # one location per buffer; no attribute is queried, the test passes if nothing raises
+            bufs,
+            "set_preferred_location",
+            [Location.host(), Location.device(device.device_id)],
+        )
+        for buf in bufs:
+            buf.close()
+
+    def test_options_must_be_none(self, init_cuda):
+        from cuda.core.utils import advise
+
+        device = Device()
+        _skip_if_managed_allocation_unsupported(device)
+        device.set_current()
+        buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+        with pytest.raises(TypeError, match="must be None"):
+            advise(buf, "set_read_mostly", options={})  # a non-None options value (empty dict)
+        buf.close()