Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
SearchContext,
derive_ctk_root,
find_via_ctk_root,
find_via_path_override,
run_find_steps,
)
from cuda.pathfinder._dynamic_libs.subprocess_protocol import (
Expand Down Expand Up @@ -60,8 +61,13 @@ def _load_driver_lib_no_cache(desc: LibDescriptor) -> LoadedDL:
Driver libs (libcuda, libnvidia-ml) are part of the display driver, not
the CUDA Toolkit. They are expected to be discoverable via the platform's
native loader mechanisms, so the full CTK search cascade (site-packages,
conda, CUDA_PATH, canary) is unnecessary.
conda, CUDA_PATH, canary) is unnecessary. The per-library override env
var is still honored so developers can point at a custom build.
"""
override_ctx = SearchContext(desc)
override_find = find_via_path_override(override_ctx)
if override_find is not None:
return LOADER.load_with_abs_path(desc, override_find.abs_path, override_find.found_via)
loaded = LOADER.check_if_already_loaded_from_elsewhere(desc, False)
if loaded is not None:
return loaded
Expand Down Expand Up @@ -221,23 +227,37 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
RuntimeError: If Python is not 64-bit.

Search order:
0. **Already loaded in the current process**
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm almost certain we have to keep this first.

0. **Already loaded in the current process**

Otherwise we could have undefined behavior. It's been a while since I looked into the details of this. If there are strong reasons, I could try to dig into this question again, but I hope it's acceptable to keep it first.

0. **Per-library path override (developer escape hatch)**

- If ``CUDA_PATHFINDER_<LIBNAME_UPPER>_PATH_OVERRIDE`` is set, it
takes precedence over every other source. The value may be either
an absolute path to the library file or a directory containing it
(searched with the same logic as other anchor-based steps). For
``libname="nvshmem_host"`` the variable is
``CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE``.

If the override is set but the library cannot be resolved from
it, the load fails immediately rather than silently falling
through. This makes the override behavior explicit and easy to
debug.

1. **Already loaded in the current process**

- If a matching library is already loaded by some other component,
return its absolute path and handle and skip the rest of the search.

1. **NVIDIA Python wheels**
2. **NVIDIA Python wheels**

- Scan installed distributions (``site-packages``) to find libraries
shipped in NVIDIA wheels.

2. **Conda environment**
3. **Conda environment**

- Conda installations are discovered via ``CONDA_PREFIX``, which is
defined automatically in activated conda environments (see
https://docs.conda.io/projects/conda-build/en/stable/user-guide/environment-variables.html).

3. **OS default mechanisms**
4. **OS default mechanisms**

- Fall back to the native loader:

Expand All @@ -253,30 +273,31 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
As a result, the native DLL search used here does **not** include
the system ``PATH``.

4. **Environment variables**
5. **Environment variables**

- If set, use ``CUDA_PATH`` or ``CUDA_HOME`` (in that order).
On Windows, this is the typical way system-installed CTK DLLs are
located. Note that the NVIDIA CTK installer automatically
adds ``CUDA_PATH`` to the system-wide environment.

5. **CTK root canary probe (discoverable libs only)**
6. **CTK root canary probe (discoverable libs only)**

- For selected libraries whose shared object doesn't reside on the
standard linker path (currently ``nvvm``), attempt to derive CTK
root by system-loading a well-known CTK canary library in a
subprocess and then searching relative to that root. On Windows,
the canary uses the same native ``LoadLibraryExW`` semantics as
step 3, so there is also no ``PATH``-based discovery.
step 4, so there is also no ``PATH``-based discovery.

**Driver libraries** (``"cuda"``, ``"nvml"``):

These are part of the NVIDIA display driver (not the CUDA Toolkit) and
are expected to be reachable via the native OS loader path. For these
libraries the search is simplified to:

0. Already loaded in the current process
1. OS default mechanisms (``dlopen`` / ``LoadLibraryExW``)
0. Per-library path override (``CUDA_PATHFINDER_<LIBNAME_UPPER>_PATH_OVERRIDE``)
1. Already loaded in the current process
2. OS default mechanisms (``dlopen`` / ``LoadLibraryExW``)

The CTK-specific steps (site-packages, conda, ``CUDA_PATH``, canary
probe) are skipped entirely.
Expand Down
50 changes: 49 additions & 1 deletion cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,52 @@ def find_via_ctk_root(ctx: SearchContext, ctk_root: str) -> FindResult | None:
# ---------------------------------------------------------------------------


_PATH_OVERRIDE_ENV_PREFIX = "CUDA_PATHFINDER_"
_PATH_OVERRIDE_ENV_SUFFIX = "_PATH_OVERRIDE"


def path_override_env_var(libname: str) -> str:
    """Build the name of the environment variable that overrides *libname*.

    The library name is upper-cased and wrapped in the fixed prefix and
    suffix, e.g. ``"nvshmem_host"`` becomes
    ``CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE``.
    """
    upper_name = libname.upper()
    return "".join((_PATH_OVERRIDE_ENV_PREFIX, upper_name, _PATH_OVERRIDE_ENV_SUFFIX))


def find_via_path_override(ctx: SearchContext) -> FindResult | None:
    """Resolve a library via the per-library override environment variable.

    The variable name is ``CUDA_PATHFINDER_<LIBNAME_UPPER>_PATH_OVERRIDE``
    (see :func:`path_override_env_var`).

    Value semantics:
    - Unset or empty: this step is a no-op and returns ``None``.
    - Path to an existing regular file: used as the resolved library file.
    - Path to an existing directory: searched for the library file using the
      same platform logic as other anchor-based steps.
    - Anything else (path does not exist, directory has no matching library):
      raises :class:`DynamicLibNotFoundError`. An explicit override that fails
      to resolve must not silently fall through to other search steps.

    Relative values are anchored to the current working directory, so the
    returned ``FindResult.abs_path`` is always absolute.

    Args:
        ctx: Search context of the library being located; ``ctx.libname``
            selects the environment variable to consult.

    Returns:
        A ``FindResult`` whose ``found_via`` is ``"override(<ENV_VAR>)"``, or
        ``None`` when the override variable is unset or empty.

    Raises:
        DynamicLibNotFoundError: The override is set but does not resolve to
            the library.
    """
    env_var = path_override_env_var(ctx.libname)
    override = os.environ.get(env_var)
    if not override:
        return None

    found_via = f"override({env_var})"

    # Use abspath() rather than normpath(): normpath("./libfoo.so") collapses
    # to the slash-free "libfoo.so", and a slash-free name handed to the
    # native loader is looked up on the library search path instead of being
    # opened as the file the user pointed at. For values that are already
    # absolute, abspath() and normpath() give the same result.
    resolved = os.path.abspath(override)

    if os.path.isfile(resolved):
        return FindResult(resolved, found_via)

    if os.path.isdir(resolved):
        abs_path = _find_using_lib_dir(ctx, resolved)
        if abs_path is not None:
            return FindResult(abs_path, found_via)
        # Report the raw override value so the message matches what the user set.
        err = ", ".join(ctx.error_messages) or f"no matching file under {override!r}"
        att = "\n".join(ctx.attachments)
        raise DynamicLibNotFoundError(
            f"{env_var}={override!r} is set but {ctx.lib_searched_for!r} was not found there: {err}\n{att}"
        )

    raise DynamicLibNotFoundError(f"{env_var}={override!r} is set but the path does not exist as a file or directory.")


def find_in_site_packages(ctx: SearchContext) -> FindResult | None:
"""Search pip wheel install locations."""
rel_dirs = ctx.platform.site_packages_rel_dirs(ctx.desc)
Expand Down Expand Up @@ -208,7 +254,9 @@ def find_in_cuda_path(ctx: SearchContext) -> FindResult | None:
# ---------------------------------------------------------------------------

#: Find steps that run before the already-loaded check and system search.
EARLY_FIND_STEPS: tuple[FindStep, ...] = (find_in_site_packages, find_in_conda)
#: The path-override step has the highest priority (it is the first entry) and
#: fails loudly: ``find_via_path_override`` raises ``DynamicLibNotFoundError``
#: if the override env var is set but the library cannot be resolved from it.
EARLY_FIND_STEPS: tuple[FindStep, ...] = (find_via_path_override, find_in_site_packages, find_in_conda)

#: Find steps that run after system search fails.
LATE_FIND_STEPS: tuple[FindStep, ...] = (find_in_cuda_path,)
Expand Down
66 changes: 66 additions & 0 deletions cuda_pathfinder/tests/test_search_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
find_in_conda,
find_in_cuda_path,
find_in_site_packages,
find_via_path_override,
path_override_env_var,
run_find_steps,
)

Expand Down Expand Up @@ -440,3 +442,67 @@ def test_nvvm_cuda_home_linux(self, mocker, tmp_path):
assert result is not None
assert result.abs_path == str(so_file)
assert result.found_via == "CUDA_PATH"


# ---------------------------------------------------------------------------
# find_via_path_override
# ---------------------------------------------------------------------------


class TestPathOverrideEnvVar:
    """Naming rules for the per-library override environment variable."""

    def test_uppercases_libname(self):
        # A plain lowercase name is simply upper-cased.
        env_name = path_override_env_var("cudart")
        assert env_name == "CUDA_PATHFINDER_CUDART_PATH_OVERRIDE"

    def test_preserves_underscore(self):
        # Underscores inside the library name are kept as-is.
        env_name = path_override_env_var("nvshmem_host")
        assert env_name == "CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE"

    def test_uppercases_mixed_case(self):
        # Mixed-case names are fully upper-cased.
        env_name = path_override_env_var("cublasLt")
        assert env_name == "CUDA_PATHFINDER_CUBLASLT_PATH_OVERRIDE"


class TestFindViaPathOverride:
    """Behavior of ``find_via_path_override`` driven by the ``cudart`` override variable."""

    # Environment variable consulted for the default ``cudart`` context, and
    # the ``found_via`` tag expected on a successful override lookup.
    _CUDART_VAR = "CUDA_PATHFINDER_CUDART_PATH_OVERRIDE"
    _EXPECTED_VIA = "override(CUDA_PATHFINDER_CUDART_PATH_OVERRIDE)"

    def test_unset_returns_none(self, monkeypatch):
        # Without the variable the step is a no-op.
        monkeypatch.delenv(self._CUDART_VAR, raising=False)
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_empty_returns_none(self, monkeypatch):
        # An empty value is treated the same as an unset variable.
        monkeypatch.setenv(self._CUDART_VAR, "")
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_file_path_used_as_is(self, monkeypatch, tmp_path):
        # A value naming an existing file is returned directly.
        lib_file = tmp_path / "libcudart.so.99"
        lib_file.touch()
        monkeypatch.setenv(self._CUDART_VAR, str(lib_file))

        outcome = find_via_path_override(_ctx())

        assert outcome is not None
        assert outcome.abs_path == os.path.normpath(str(lib_file))
        assert outcome.found_via == self._EXPECTED_VIA

    def test_directory_searched_linux(self, monkeypatch, tmp_path):
        # A value naming a directory is searched for the library file.
        lib_file = tmp_path / "libcudart.so"
        lib_file.touch()
        monkeypatch.setenv(self._CUDART_VAR, str(tmp_path))

        outcome = find_via_path_override(_ctx(platform=LinuxSearchPlatform()))

        assert outcome is not None
        assert outcome.abs_path == str(lib_file)
        assert outcome.found_via == self._EXPECTED_VIA

    def test_nonexistent_path_raises(self, monkeypatch, tmp_path):
        # A value that is neither a file nor a directory is an error.
        missing = tmp_path / "does-not-exist"
        monkeypatch.setenv(self._CUDART_VAR, str(missing))
        with pytest.raises(DynamicLibNotFoundError, match="does not exist"):
            find_via_path_override(_ctx())

    def test_directory_without_lib_raises(self, monkeypatch, tmp_path):
        # An existing but empty directory must not fall through silently.
        monkeypatch.setenv(self._CUDART_VAR, str(tmp_path))
        linux_ctx = _ctx(platform=LinuxSearchPlatform())
        with pytest.raises(DynamicLibNotFoundError, match="was not found there"):
            find_via_path_override(linux_ctx)

    def test_per_lib_isolation(self, monkeypatch, tmp_path):
        # Override for nvshmem_host must not affect cudart lookups.
        monkeypatch.setenv("CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE", str(tmp_path / "nope"))
        monkeypatch.delenv(self._CUDART_VAR, raising=False)
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_runs_first_in_early_steps(self):
        # The override step must be the first of the early find steps.
        first_step = EARLY_FIND_STEPS[0]
        assert first_step is find_via_path_override
Loading