From 1fad5517a4cde3ec001b2995ae54da445a165299 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 8 May 2026 15:27:52 +0200 Subject: [PATCH 1/5] Allow for CUDA driver minor version compatibility --- create_lmodsitepackage.py | 126 ++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index bb8b49b3..c9abe976 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -161,72 +161,78 @@ local checkGpu = mt:haveProperty(simpleName,"arch","gpu") local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK") if checkGpu and (overrideGpuCheck == nil) then - local eessi_version = os.getenv('EESSI_VERSION') or "" - local eessi_eprefix = os.getenv("EESSI_EPREFIX") or "" + local eessi_version = os.getenv('EESSI_VERSION') + local eessi_eprefix = os.getenv("EESSI_EPREFIX") if eessi_eprefix == nil or eessi_version == nil then LmodError("EESSI_VERSION and EESSI_EPREFIX must be defined for GPU driver check to work\\n") end - local cudaDriverDir = nil - if eessi_version == "2023.06" then - cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib" - else - cudaDriverDir = eessi_eprefix .. "/lib/nvidia" - end - local cudaDriverFile = cudaDriverDir .. "/libcuda.so" - local cudaDriverExists = isFile(cudaDriverFile) - local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") - if not (cudaDriverExists or singularityCudaExists) then - local advice = "which relies on the CUDA runtime environment and driver libraries. " - advice = advice .. "In order to be able to use the module, you will need " - advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. " - advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n" - advice = advice .. "You can override this check by setting the environment variable " - advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but " - advice = advice .. "the loaded application will not be able to execute on your system.\\n" - advice = advice .. refer_to_docs - LmodError("\\nYou requested to load ", simpleName, " ", advice) - else - -- CUDA driver exists, now we check its version to see if an update is needed - if cudaDriverExists then - local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") - if not cudaVersion or cudaVersion == "" then - local eessi_prefix = os.getenv("EESSI_PREFIX") - local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') - source_sh("bash", script) - end - cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") - local cudaVersion_req = os.getenv("EESSICUDAVERSION") - -- Account for the fact that the script sourced above was designed to never return a non-zero exit code, - -- even if it fails to set EESSI_CUDA_DRIVER_VERSION - -- Essentially, we handle that case here by raising an error, which can be suppressed - if not cudaVersion or cudaVersion == "" then - local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" - local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " - warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" - warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " - warn = warn .. "as expected. Export " .. suppress_var .. "=1" - local suppress_warn = os.getenv(suppress_var) - if not suppress_warn or suppress_warn == 1 then - LmodWarning(warn) + -- Having EESSICUDAVERSION set means we have an NVIDIA accelerator + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + if cudaVersion_req then + local cudaDriverDir = nil + if eessi_version == "2023.06" then + cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib" + else + cudaDriverDir = eessi_eprefix .. "/lib/nvidia" + end + local cudaDriverFile = cudaDriverDir .. "/libcuda.so" + local cudaDriverExists = isFile(cudaDriverFile) + local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") + if not (cudaDriverExists or singularityCudaExists) then + local advice = "which relies on the CUDA runtime environment and driver libraries. " + advice = advice .. "In order to be able to use the module, you will need " + advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. " + advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n" + advice = advice .. "You can override this check by setting the environment variable " + advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but " + advice = advice .. "the loaded application will not be able to execute on your system.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) + else + -- CUDA driver exists, now we check its version to see if an update is needed + if cudaDriverExists then + local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") + if not cudaVersion or cudaVersion == "" then + local eessi_prefix = os.getenv("EESSI_PREFIX") + local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') + source_sh("bash", script) end - else - -- driver CUDA versions don't give a patch version for CUDA - local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") - local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") - local driver_libs_need_update = false - if tonumber(major) < tonumber(major_req) then - driver_libs_need_update = true - elseif tonumber(major) == tonumber(major_req) then - if tonumber(minor) < tonumber(minor_req) then + cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") + -- Account for the fact that the script sourced above was designed to never return a non-zero exit code, + -- even if it fails to set EESSI_CUDA_DRIVER_VERSION + -- Essentially, we handle that case here by raising an error, which can be suppressed + if not cudaVersion or cudaVersion == "" then + local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" + local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " + warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" + warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " + warn = warn .. "as expected. Export " .. suppress_var .. "=1" + local suppress_warn = os.getenv(suppress_var) + if not suppress_warn or suppress_warn == 1 then + LmodWarning(warn) + end + else + -- driver CUDA versions don't give a patch version for CUDA + local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") + local driver_libs_need_update = false + if tonumber(major) < tonumber(major_req) then driver_libs_need_update = true + elseif tonumber(major) == tonumber(major_req) then + if tonumber(minor) < tonumber(minor_req) then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "You will therefore be in minor version compatibility mode as described in " + advice = advice .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" + LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", advice) + end + end + if driver_libs_need_update == true then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "Please update your CUDA driver libraries and then " + advice = advice .. "let EESSI know about the update.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end - end - if driver_libs_need_update == true then - local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " - advice = advice .. "Please update your CUDA driver libraries and then " - advice = advice .. "let EESSI know about the update.\\n" - advice = advice .. refer_to_docs - LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end end end From 288ce978e6b385237edb1c4fd28d4feb9175b2cf Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 8 May 2026 15:44:45 +0200 Subject: [PATCH 2/5] Only print the warning once --- create_lmodsitepackage.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index c9abe976..49624d6b 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -201,13 +201,13 @@ -- Account for the fact that the script sourced above was designed to never return a non-zero exit code, -- even if it fails to set EESSI_CUDA_DRIVER_VERSION -- Essentially, we handle that case here by raising an error, which can be suppressed + local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" + local suppress_warn = os.getenv(suppress_var) if not cudaVersion or cudaVersion == "" then - local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " warn = warn .. "as expected. Export " .. suppress_var .. "=1" - local suppress_warn = os.getenv(suppress_var) if not suppress_warn or suppress_warn == 1 then LmodWarning(warn) end @@ -220,14 +220,16 @@ driver_libs_need_update = true elseif tonumber(major) == tonumber(major_req) then if tonumber(minor) < tonumber(minor_req) then - local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " - advice = advice .. "You will therefore be in minor version compatibility mode as described in " - advice = advice .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" - LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", advice) + local warn = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + warn = warn .. "You will therefore be in minor version compatibility mode as described in " + warn = warn .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" + if not suppress_warn or suppress_warn == 1 then + LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", warn) + end end end if driver_libs_need_update == true then - local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " advice = advice .. "Please update your CUDA driver libraries and then " advice = advice .. "let EESSI know about the update.\\n" advice = advice .. refer_to_docs From 4b8b0347eae3c96c3834e03554e17d3f8c4ca549 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 8 May 2026 15:51:22 +0200 Subject: [PATCH 3/5] Actually suppress the warning --- create_lmodsitepackage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 49624d6b..8bba0f1c 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -225,6 +225,7 @@ warn = warn .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" if not suppress_warn or suppress_warn == 1 then LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", warn) + setenv(suppress_var, "1") end end end From 0b0634521b3fdc4cb17698cba2cafed7b9a9abeb Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 8 May 2026 16:20:05 +0200 Subject: [PATCH 4/5] Allow that the warning appears in a clean environment --- create_lmodsitepackage.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 8bba0f1c..f0ca8e83 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -208,7 +208,7 @@ warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " warn = warn .. "as expected. Export " .. suppress_var .. "=1" - if not suppress_warn or suppress_warn == 1 then + if not suppress_warn or suppress_warn == "1" then LmodWarning(warn) end else @@ -223,9 +223,17 @@ local warn = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " warn = warn .. "You will therefore be in minor version compatibility mode as described in " warn = warn .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" - if not suppress_warn or suppress_warn == 1 then + if not suppress_warn or suppress_warn == "1" then LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", warn) - setenv(suppress_var, "1") + if not suppress_warn then + pushenv(suppress_var, myModuleName()) + end + end + if (mode() == "unload") then + if suppress_warn == myModuleName() then + -- make sure the variable eventually gets unset + pushenv(suppress_var, myModuleName()) + end end end end From 19cbceec6c7a98f82b81b229f00c1d32610f7fc7 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 8 May 2026 16:32:21 +0200 Subject: [PATCH 5/5] I don't know how to unset this cleanly --- create_lmodsitepackage.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index f0ca8e83..c2905d68 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -229,12 +229,6 @@ pushenv(suppress_var, myModuleName()) end end - if (mode() == "unload") then - if suppress_warn == myModuleName() then - -- make sure the variable eventually gets unset - pushenv(suppress_var, myModuleName()) - end - end end end if driver_libs_need_update == true then