From 86adb00afba8165d6d1a40d4979feba8edc19e0c Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Sun, 26 Apr 2026 11:48:36 -0700 Subject: [PATCH 1/2] [FEA]: Add per-library path override env var to cuda.pathfinder Adds CUDA_PATHFINDER_<LIBNAME>_PATH_OVERRIDE as a developer escape hatch for pointing cuda.pathfinder at a custom build of a specific library (e.g. a development branch of nvshmem) without having to remove the wheel or copy .so files into site-packages. The value can be either an absolute file path (used as-is) or a directory (searched with the same platform logic as conda / CUDA_PATH). The override has the highest priority and applies uniformly to CTK, third-party, and driver libraries. If the env var is set but the library cannot be resolved from it, the load fails immediately rather than silently falling through to other search steps. This keeps the override behavior explicit and easy to debug. Closes #1054. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../_dynamic_libs/load_nvidia_dynamic_lib.py | 41 +++++++++--- .../pathfinder/_dynamic_libs/search_steps.py | 52 ++++++++++++++- cuda_pathfinder/tests/test_search_steps.py | 66 +++++++++++++++++++ 3 files changed, 148 insertions(+), 11 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py index a7a8965d2e8..8c6c084549a 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py @@ -24,6 +24,7 @@ SearchContext, derive_ctk_root, find_via_ctk_root, + find_via_path_override, run_find_steps, ) from cuda.pathfinder._dynamic_libs.subprocess_protocol import ( @@ -60,8 +61,13 @@ def _load_driver_lib_no_cache(desc: LibDescriptor) -> LoadedDL: Driver libs (libcuda, libnvidia-ml) are part of the display driver, not the CUDA Toolkit.
They are expected to be discoverable via the platform's native loader mechanisms, so the full CTK search cascade (site-packages, - conda, CUDA_PATH, canary) is unnecessary. + conda, CUDA_PATH, canary) is unnecessary. The per-library override env + var is still honored so developers can point at a custom build. """ + override_ctx = SearchContext(desc) + override_find = find_via_path_override(override_ctx) + if override_find is not None: + return LOADER.load_with_abs_path(desc, override_find.abs_path, override_find.found_via) loaded = LOADER.check_if_already_loaded_from_elsewhere(desc, False) if loaded is not None: return loaded @@ -221,23 +227,37 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: RuntimeError: If Python is not 64-bit. Search order: - 0. **Already loaded in the current process** + 0. **Per-library path override (developer escape hatch)** + + - If ``CUDA_PATHFINDER_<LIBNAME>_PATH_OVERRIDE`` is set, it + takes precedence over every other source. The value may be either + an absolute path to the library file or a directory containing it + (searched with the same logic as other anchor-based steps). For + ``libname="nvshmem_host"`` the variable is + ``CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE``. + + If the override is set but the library cannot be resolved from + it, the load fails immediately rather than silently falling + through. This makes the override behavior explicit and easy to + debug. + + 1. **Already loaded in the current process** - If a matching library is already loaded by some other component, return its absolute path and handle and skip the rest of the search. - 1. **NVIDIA Python wheels** + 2. **NVIDIA Python wheels** - Scan installed distributions (``site-packages``) to find libraries shipped in NVIDIA wheels. - 2. **Conda environment** + 3.
**Conda environment** - Conda installations are discovered via ``CONDA_PREFIX``, which is defined automatically in activated conda environments (see https://docs.conda.io/projects/conda-build/en/stable/user-guide/environment-variables.html). - 3. **OS default mechanisms** + 4. **OS default mechanisms** - Fall back to the native loader: @@ -253,21 +273,21 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: As a result, the native DLL search used here does **not** include the system ``PATH``. - 4. **Environment variables** + 5. **Environment variables** - If set, use ``CUDA_PATH`` or ``CUDA_HOME`` (in that order). On Windows, this is the typical way system-installed CTK DLLs are located. Note that the NVIDIA CTK installer automatically adds ``CUDA_PATH`` to the system-wide environment. - 5. **CTK root canary probe (discoverable libs only)** + 6. **CTK root canary probe (discoverable libs only)** - For selected libraries whose shared object doesn't reside on the standard linker path (currently ``nvvm``), attempt to derive CTK root by system-loading a well-known CTK canary library in a subprocess and then searching relative to that root. On Windows, the canary uses the same native ``LoadLibraryExW`` semantics as - step 3, so there is also no ``PATH``-based discovery. + step 4, so there is also no ``PATH``-based discovery. **Driver libraries** (``"cuda"``, ``"nvml"``): @@ -275,8 +295,9 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: are expected to be reachable via the native OS loader path. For these libraries the search is simplified to: - 0. Already loaded in the current process - 1. OS default mechanisms (``dlopen`` / ``LoadLibraryExW``) + 0. Per-library path override (``CUDA_PATHFINDER_<LIBNAME>_PATH_OVERRIDE``) + 1. Already loaded in the current process + 2. OS default mechanisms (``dlopen`` / ``LoadLibraryExW``) The CTK-specific steps (site-packages, conda, ``CUDA_PATH``, canary probe) are skipped entirely.
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py index 55d8a8aa674..fbecbaed6a1 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py @@ -158,6 +158,54 @@ def find_via_ctk_root(ctx: SearchContext, ctk_root: str) -> FindResult | None: # --------------------------------------------------------------------------- +_PATH_OVERRIDE_ENV_PREFIX = "CUDA_PATHFINDER_" +_PATH_OVERRIDE_ENV_SUFFIX = "_PATH_OVERRIDE" + + +def path_override_env_var(libname: str) -> str: + """Return the per-library override environment variable name.""" + return f"{_PATH_OVERRIDE_ENV_PREFIX}{libname.upper()}{_PATH_OVERRIDE_ENV_SUFFIX}" + + +def find_via_path_override(ctx: SearchContext) -> FindResult | None: + """Resolve a library via the per-library override environment variable. + + The variable name is ``CUDA_PATHFINDER_<LIBNAME>_PATH_OVERRIDE``. + + Value semantics: + - Unset or empty: this step is a no-op and returns ``None``. + - Path to an existing regular file: used as the resolved library file. + - Path to an existing directory: searched for the library file using the + same platform logic as other anchor-based steps. + - Anything else (path does not exist, directory has no matching library): + raises :class:`DynamicLibNotFoundError`. An explicit override that fails + to resolve must not silently fall through to other search steps.
+ """ + env_var = path_override_env_var(ctx.libname) + override = os.environ.get(env_var) + if not override: + return None + + found_via = f"override({env_var})" + + if os.path.isfile(override): + return FindResult(os.path.normpath(override), found_via) + + if os.path.isdir(override): + abs_path = _find_using_lib_dir(ctx, override) + if abs_path is not None: + return FindResult(abs_path, found_via) + err = ", ".join(ctx.error_messages) or f"no matching file under {override!r}" + att = "\n".join(ctx.attachments) + raise DynamicLibNotFoundError( + f'{env_var}={override!r} is set but {ctx.lib_searched_for!r} was not found there: {err}\n{att}' + ) + + raise DynamicLibNotFoundError( + f'{env_var}={override!r} is set but the path does not exist as a file or directory.' + ) + + def find_in_site_packages(ctx: SearchContext) -> FindResult | None: """Search pip wheel install locations.""" rel_dirs = ctx.platform.site_packages_rel_dirs(ctx.desc) @@ -208,7 +256,9 @@ def find_in_cuda_path(ctx: SearchContext) -> FindResult | None: # --------------------------------------------------------------------------- #: Find steps that run before the already-loaded check and system search. -EARLY_FIND_STEPS: tuple[FindStep, ...] = (find_in_site_packages, find_in_conda) +#: The path-override step has the highest priority and fails loudly if the +#: override env var is set but the library cannot be resolved from it. +EARLY_FIND_STEPS: tuple[FindStep, ...] = (find_via_path_override, find_in_site_packages, find_in_conda) #: Find steps that run after system search fails. LATE_FIND_STEPS: tuple[FindStep, ...] 
= (find_in_cuda_path,) diff --git a/cuda_pathfinder/tests/test_search_steps.py b/cuda_pathfinder/tests/test_search_steps.py index 1b881707dfb..166922a2b01 100644 --- a/cuda_pathfinder/tests/test_search_steps.py +++ b/cuda_pathfinder/tests/test_search_steps.py @@ -21,6 +21,8 @@ find_in_conda, find_in_cuda_path, find_in_site_packages, + find_via_path_override, + path_override_env_var, run_find_steps, ) @@ -440,3 +442,67 @@ def test_nvvm_cuda_home_linux(self, mocker, tmp_path): assert result is not None assert result.abs_path == str(so_file) assert result.found_via == "CUDA_PATH" + + +# --------------------------------------------------------------------------- +# find_via_path_override +# --------------------------------------------------------------------------- + + +class TestPathOverrideEnvVar: + def test_uppercases_libname(self): + assert path_override_env_var("cudart") == "CUDA_PATHFINDER_CUDART_PATH_OVERRIDE" + + def test_preserves_underscore(self): + assert path_override_env_var("nvshmem_host") == "CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE" + + def test_uppercases_mixed_case(self): + assert path_override_env_var("cublasLt") == "CUDA_PATHFINDER_CUBLASLT_PATH_OVERRIDE" + + +class TestFindViaPathOverride: + def test_unset_returns_none(self, monkeypatch): + monkeypatch.delenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", raising=False) + assert find_via_path_override(_ctx()) is None + + def test_empty_returns_none(self, monkeypatch): + monkeypatch.setenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", "") + assert find_via_path_override(_ctx()) is None + + def test_file_path_used_as_is(self, monkeypatch, tmp_path): + so_file = tmp_path / "libcudart.so.99" + so_file.touch() + monkeypatch.setenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", str(so_file)) + result = find_via_path_override(_ctx()) + assert result is not None + assert result.abs_path == os.path.normpath(str(so_file)) + assert result.found_via == "override(CUDA_PATHFINDER_CUDART_PATH_OVERRIDE)" + + def 
test_directory_searched_linux(self, monkeypatch, tmp_path): + so_file = tmp_path / "libcudart.so" + so_file.touch() + monkeypatch.setenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", str(tmp_path)) + result = find_via_path_override(_ctx(platform=LinuxSearchPlatform())) + assert result is not None + assert result.abs_path == str(so_file) + assert result.found_via == "override(CUDA_PATHFINDER_CUDART_PATH_OVERRIDE)" + + def test_nonexistent_path_raises(self, monkeypatch, tmp_path): + bogus = tmp_path / "does-not-exist" + monkeypatch.setenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", str(bogus)) + with pytest.raises(DynamicLibNotFoundError, match="does not exist"): + find_via_path_override(_ctx()) + + def test_directory_without_lib_raises(self, monkeypatch, tmp_path): + monkeypatch.setenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", str(tmp_path)) + with pytest.raises(DynamicLibNotFoundError, match="was not found there"): + find_via_path_override(_ctx(platform=LinuxSearchPlatform())) + + def test_per_lib_isolation(self, monkeypatch, tmp_path): + # Override for nvshmem_host must not affect cudart lookups. 
+ monkeypatch.setenv("CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE", str(tmp_path / "nope")) + monkeypatch.delenv("CUDA_PATHFINDER_CUDART_PATH_OVERRIDE", raising=False) + assert find_via_path_override(_ctx()) is None + + def test_runs_first_in_early_steps(self): + assert EARLY_FIND_STEPS[0] is find_via_path_override From 23af33dc61084ad7c5df49152e88aad412e4fad2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Sun, 26 Apr 2026 11:54:17 -0700 Subject: [PATCH 2/2] style: apply ruff auto-fixes Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/pathfinder/_dynamic_libs/search_steps.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py index fbecbaed6a1..d414b852437 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_steps.py @@ -198,12 +198,10 @@ def find_via_path_override(ctx: SearchContext) -> FindResult | None: err = ", ".join(ctx.error_messages) or f"no matching file under {override!r}" att = "\n".join(ctx.attachments) raise DynamicLibNotFoundError( - f'{env_var}={override!r} is set but {ctx.lib_searched_for!r} was not found there: {err}\n{att}' + f"{env_var}={override!r} is set but {ctx.lib_searched_for!r} was not found there: {err}\n{att}" ) - raise DynamicLibNotFoundError( - f'{env_var}={override!r} is set but the path does not exist as a file or directory.' - ) + raise DynamicLibNotFoundError(f"{env_var}={override!r} is set but the path does not exist as a file or directory.") def find_in_site_packages(ctx: SearchContext) -> FindResult | None: