diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py index a7a8965d2e..8c6c084549 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py @@ -24,6 +24,7 @@ SearchContext, derive_ctk_root, find_via_ctk_root, + find_via_path_override, run_find_steps, ) from cuda.pathfinder._dynamic_libs.subprocess_protocol import ( @@ -60,8 +61,13 @@ def _load_driver_lib_no_cache(desc: LibDescriptor) -> LoadedDL: Driver libs (libcuda, libnvidia-ml) are part of the display driver, not the CUDA Toolkit. They are expected to be discoverable via the platform's native loader mechanisms, so the full CTK search cascade (site-packages, - conda, CUDA_PATH, canary) is unnecessary. + conda, CUDA_PATH, canary) is unnecessary. The per-library override env + var is still honored so developers can point at a custom build. """ + override_ctx = SearchContext(desc) + override_find = find_via_path_override(override_ctx) + if override_find is not None: + return LOADER.load_with_abs_path(desc, override_find.abs_path, override_find.found_via) loaded = LOADER.check_if_already_loaded_from_elsewhere(desc, False) if loaded is not None: return loaded @@ -221,23 +227,37 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: RuntimeError: If Python is not 64-bit. Search order: - 0. **Already loaded in the current process** + 0. **Per-library path override (developer escape hatch)** + + - If ``CUDA_PATHFINDER_{LIBNAME}_PATH_OVERRIDE`` is set, it + takes precedence over every other source. The value may be either + an absolute path to the library file or a directory containing it + (searched with the same logic as other anchor-based steps). For + ``libname="nvshmem_host"`` the variable is + ``CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE``. 
+ + If the override is set but the library cannot be resolved from + it, the load fails immediately rather than silently falling + through. This makes the override behavior explicit and easy to + debug. + + 1. **Already loaded in the current process** - If a matching library is already loaded by some other component, return its absolute path and handle and skip the rest of the search. - 1. **NVIDIA Python wheels** + 2. **NVIDIA Python wheels** - Scan installed distributions (``site-packages``) to find libraries shipped in NVIDIA wheels. - 2. **Conda environment** + 3. **Conda environment** - Conda installations are discovered via ``CONDA_PREFIX``, which is defined automatically in activated conda environments (see https://docs.conda.io/projects/conda-build/en/stable/user-guide/environment-variables.html). - 3. **OS default mechanisms** + 4. **OS default mechanisms** - Fall back to the native loader: @@ -253,21 +273,21 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: As a result, the native DLL search used here does **not** include the system ``PATH``. - 4. **Environment variables** + 5. **Environment variables** - If set, use ``CUDA_PATH`` or ``CUDA_HOME`` (in that order). On Windows, this is the typical way system-installed CTK DLLs are located. Note that the NVIDIA CTK installer automatically adds ``CUDA_PATH`` to the system-wide environment. - 5. **CTK root canary probe (discoverable libs only)** + 6. **CTK root canary probe (discoverable libs only)** - For selected libraries whose shared object doesn't reside on the standard linker path (currently ``nvvm``), attempt to derive CTK root by system-loading a well-known CTK canary library in a subprocess and then searching relative to that root. On Windows, the canary uses the same native ``LoadLibraryExW`` semantics as - step 3, so there is also no ``PATH``-based discovery. + step 4, so there is also no ``PATH``-based discovery. 
_PATH_OVERRIDE_ENV_PREFIX = "CUDA_PATHFINDER_"
_PATH_OVERRIDE_ENV_SUFFIX = "_PATH_OVERRIDE"


def path_override_env_var(libname: str) -> str:
    """Return the per-library override environment variable name.

    The name has the form ``CUDA_PATHFINDER_{LIBNAME}_PATH_OVERRIDE``, where
    ``{LIBNAME}`` is ``libname`` upper-cased (underscores are kept as-is).
    For example ``"nvshmem_host"`` maps to
    ``CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE``.

    Args:
        libname: Short library name, e.g. ``"cudart"`` or ``"cublasLt"``.

    Returns:
        The environment variable name to consult for ``libname``.
    """
    return f"{_PATH_OVERRIDE_ENV_PREFIX}{libname.upper()}{_PATH_OVERRIDE_ENV_SUFFIX}"


def find_via_path_override(ctx: SearchContext) -> FindResult | None:
    """Resolve a library via the per-library override environment variable.

    The variable name is ``CUDA_PATHFINDER_{LIBNAME}_PATH_OVERRIDE`` (see
    :func:`path_override_env_var`).

    Value semantics:
      - Unset or empty: this step is a no-op and returns ``None``.
      - Path to an existing regular file: used as the resolved library file.
      - Path to an existing directory: searched for the library file using the
        same platform logic as other anchor-based steps.
      - Anything else (path does not exist, directory has no matching library):
        raises :class:`DynamicLibNotFoundError`. An explicit override that fails
        to resolve must not silently fall through to other search steps.

    A relative value is resolved against the current working directory at the
    time of the call, so the returned ``abs_path`` is always absolute.

    Args:
        ctx: Search context for the library being located; ``ctx.libname``
            selects which override variable is read.

    Returns:
        A :class:`FindResult` whose ``found_via`` is ``"override(<env var>)"``,
        or ``None`` when the override variable is unset or empty.

    Raises:
        DynamicLibNotFoundError: If the override is set but names neither an
            existing file nor a directory containing the library.
    """
    env_var = path_override_env_var(ctx.libname)
    override = os.environ.get(env_var)
    if not override:
        return None

    found_via = f"override({env_var})"

    # Anchor the value to an absolute, normalized path once, up front. Without
    # this a relative override (e.g. a bare file name) would pass the existence
    # checks below yet be reported as a non-absolute ``abs_path``, leaving the
    # final resolution to whatever the loader does with a relative name.
    # For values that are already absolute this equals ``os.path.normpath``.
    resolved = os.path.abspath(override)

    if os.path.isfile(resolved):
        return FindResult(resolved, found_via)

    if os.path.isdir(resolved):
        abs_path = _find_using_lib_dir(ctx, resolved)
        if abs_path is not None:
            return FindResult(abs_path, found_via)
        # Report the value exactly as the user set it, plus whatever details
        # the directory search recorded on the context.
        err = ", ".join(ctx.error_messages) or f"no matching file under {override!r}"
        att = "\n".join(ctx.attachments)
        raise DynamicLibNotFoundError(
            f"{env_var}={override!r} is set but {ctx.lib_searched_for!r} was not found there: {err}\n{att}"
        )

    raise DynamicLibNotFoundError(f"{env_var}={override!r} is set but the path does not exist as a file or directory.")
# ---------------------------------------------------------------------------
# find_via_path_override
# ---------------------------------------------------------------------------

# Override variable consulted for the default ("cudart") test context.
_CUDART_OVERRIDE_VAR = "CUDA_PATHFINDER_CUDART_PATH_OVERRIDE"


class TestPathOverrideEnvVar:
    """Naming rules for the per-library override environment variable."""

    def test_uppercases_libname(self):
        expected = "CUDA_PATHFINDER_CUDART_PATH_OVERRIDE"
        assert path_override_env_var("cudart") == expected

    def test_preserves_underscore(self):
        expected = "CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE"
        assert path_override_env_var("nvshmem_host") == expected

    def test_uppercases_mixed_case(self):
        expected = "CUDA_PATHFINDER_CUBLASLT_PATH_OVERRIDE"
        assert path_override_env_var("cublasLt") == expected


class TestFindViaPathOverride:
    """Behavior of the override step for each kind of variable value."""

    def test_unset_returns_none(self, monkeypatch):
        # No variable at all: the step must be a no-op.
        monkeypatch.delenv(_CUDART_OVERRIDE_VAR, raising=False)
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_empty_returns_none(self, monkeypatch):
        # An empty value is treated the same as an unset variable.
        monkeypatch.setenv(_CUDART_OVERRIDE_VAR, "")
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_file_path_used_as_is(self, monkeypatch, tmp_path):
        # A value naming an existing file is returned directly (normalized).
        lib_file = tmp_path / "libcudart.so.99"
        lib_file.touch()
        monkeypatch.setenv(_CUDART_OVERRIDE_VAR, str(lib_file))

        outcome = find_via_path_override(_ctx())

        assert outcome is not None
        assert outcome.abs_path == os.path.normpath(str(lib_file))
        assert outcome.found_via == "override(CUDA_PATHFINDER_CUDART_PATH_OVERRIDE)"

    def test_directory_searched_linux(self, monkeypatch, tmp_path):
        # A value naming a directory is searched for the library file.
        lib_file = tmp_path / "libcudart.so"
        lib_file.touch()
        monkeypatch.setenv(_CUDART_OVERRIDE_VAR, str(tmp_path))
        linux_ctx = _ctx(platform=LinuxSearchPlatform())

        outcome = find_via_path_override(linux_ctx)

        assert outcome is not None
        assert outcome.abs_path == str(lib_file)
        assert outcome.found_via == "override(CUDA_PATHFINDER_CUDART_PATH_OVERRIDE)"

    def test_nonexistent_path_raises(self, monkeypatch, tmp_path):
        # A set-but-missing path must fail loudly instead of falling through.
        missing = tmp_path / "does-not-exist"
        monkeypatch.setenv(_CUDART_OVERRIDE_VAR, str(missing))
        with pytest.raises(DynamicLibNotFoundError, match="does not exist"):
            find_via_path_override(_ctx())

    def test_directory_without_lib_raises(self, monkeypatch, tmp_path):
        # An existing directory that lacks the library is also a hard error.
        monkeypatch.setenv(_CUDART_OVERRIDE_VAR, str(tmp_path))
        linux_ctx = _ctx(platform=LinuxSearchPlatform())
        with pytest.raises(DynamicLibNotFoundError, match="was not found there"):
            find_via_path_override(linux_ctx)

    def test_per_lib_isolation(self, monkeypatch, tmp_path):
        # Override for nvshmem_host must not affect cudart lookups.
        unrelated_target = str(tmp_path / "nope")
        monkeypatch.setenv("CUDA_PATHFINDER_NVSHMEM_HOST_PATH_OVERRIDE", unrelated_target)
        monkeypatch.delenv(_CUDART_OVERRIDE_VAR, raising=False)
        outcome = find_via_path_override(_ctx())
        assert outcome is None

    def test_runs_first_in_early_steps(self):
        # The override step must be the highest-priority early find step.
        first_step = EARLY_FIND_STEPS[0]
        assert first_step is find_via_path_override