-
Notifications
You must be signed in to change notification settings - Fork 19
Allow for CUDA driver minor version compatibility #226
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ocaisa
wants to merge
5
commits into
EESSI:main
Choose a base branch
from
ocaisa:cuda_error_to_warning
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+70
−59
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
1fad551
Allow for CUDA driver minor version compatibility
ocaisa 288ce97
Only print the warning once
ocaisa 4b8b034
Actually suppress the warning
ocaisa 0b06345
Allow that the warning appears in a clean environment
ocaisa 19cbcee
I don't know how to unset this cleanly
ocaisa File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -161,72 +161,83 @@ | |
| local checkGpu = mt:haveProperty(simpleName,"arch","gpu") | ||
| local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK") | ||
| if checkGpu and (overrideGpuCheck == nil) then | ||
| local eessi_version = os.getenv('EESSI_VERSION') or "" | ||
| local eessi_eprefix = os.getenv("EESSI_EPREFIX") or "" | ||
| local eessi_version = os.getenv('EESSI_VERSION') | ||
| local eessi_eprefix = os.getenv("EESSI_EPREFIX") | ||
| if eessi_eprefix == nil or eessi_version == nil then | ||
| LmodError("EESSI_VERSION and EESSI_EPREFIX must be defined for GPU driver check to work\\n") | ||
| end | ||
| local cudaDriverDir = nil | ||
| if eessi_version == "2023.06" then | ||
| cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib" | ||
| else | ||
| cudaDriverDir = eessi_eprefix .. "/lib/nvidia" | ||
| end | ||
| local cudaDriverFile = cudaDriverDir .. "/libcuda.so" | ||
| local cudaDriverExists = isFile(cudaDriverFile) | ||
| local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") | ||
| if not (cudaDriverExists or singularityCudaExists) then | ||
| local advice = "which relies on the CUDA runtime environment and driver libraries. " | ||
| advice = advice .. "In order to be able to use the module, you will need " | ||
| advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. " | ||
| advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n" | ||
| advice = advice .. "You can override this check by setting the environment variable " | ||
| advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but " | ||
| advice = advice .. "the loaded application will not be able to execute on your system.\\n" | ||
| advice = advice .. refer_to_docs | ||
| LmodError("\\nYou requested to load ", simpleName, " ", advice) | ||
| else | ||
| -- CUDA driver exists, now we check its version to see if an update is needed | ||
| if cudaDriverExists then | ||
| local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") | ||
| if not cudaVersion or cudaVersion == "" then | ||
| local eessi_prefix = os.getenv("EESSI_PREFIX") | ||
| local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') | ||
| source_sh("bash", script) | ||
| end | ||
| cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") | ||
| local cudaVersion_req = os.getenv("EESSICUDAVERSION") | ||
| -- Account for the fact that the script sourced above was designed to never return a non-zero exit code, | ||
| -- even if it fails to set EESSI_CUDA_DRIVER_VERSION | ||
| -- Essentially, we handle that case here by raising an error, which can be suppressed | ||
| if not cudaVersion or cudaVersion == "" then | ||
| -- Having EESSICUDAVERSION set means we have an NVIDIA accelerator | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using this for forward thinking for when we have to deal with something similar for AMD |
||
| local cudaVersion_req = os.getenv("EESSICUDAVERSION") | ||
| if cudaVersion_req then | ||
| local cudaDriverDir = nil | ||
| if eessi_version == "2023.06" then | ||
| cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib" | ||
| else | ||
| cudaDriverDir = eessi_eprefix .. "/lib/nvidia" | ||
| end | ||
| local cudaDriverFile = cudaDriverDir .. "/libcuda.so" | ||
| local cudaDriverExists = isFile(cudaDriverFile) | ||
| local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") | ||
| if not (cudaDriverExists or singularityCudaExists) then | ||
| local advice = "which relies on the CUDA runtime environment and driver libraries. " | ||
| advice = advice .. "In order to be able to use the module, you will need " | ||
| advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. " | ||
| advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n" | ||
| advice = advice .. "You can override this check by setting the environment variable " | ||
| advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but " | ||
| advice = advice .. "the loaded application will not be able to execute on your system.\\n" | ||
| advice = advice .. refer_to_docs | ||
| LmodError("\\nYou requested to load ", simpleName, " ", advice) | ||
| else | ||
| -- CUDA driver exists, now we check its version to see if an update is needed | ||
| if cudaDriverExists then | ||
| local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") | ||
| if not cudaVersion or cudaVersion == "" then | ||
| local eessi_prefix = os.getenv("EESSI_PREFIX") | ||
| local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh') | ||
| source_sh("bash", script) | ||
| end | ||
| cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION") | ||
| -- Account for the fact that the script sourced above was designed to never return a non-zero exit code, | ||
| -- even if it fails to set EESSI_CUDA_DRIVER_VERSION | ||
| -- Essentially, we handle that case here by raising an error, which can be suppressed | ||
| local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING" | ||
| local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " | ||
| warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" | ||
| warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " | ||
| warn = warn .. "as expected. Export " .. suppress_var .. "=1" | ||
| local suppress_warn = os.getenv(suppress_var) | ||
| if not suppress_warn or suppress_warn == 1 then | ||
| LmodWarning(warn) | ||
| end | ||
| else | ||
| -- driver CUDA versions don't give a patch version for CUDA | ||
| local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") | ||
| local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") | ||
| local driver_libs_need_update = false | ||
| if tonumber(major) < tonumber(major_req) then | ||
| driver_libs_need_update = true | ||
| elseif tonumber(major) == tonumber(major_req) then | ||
| if tonumber(minor) < tonumber(minor_req) then | ||
| if not cudaVersion or cudaVersion == "" then | ||
| local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. " | ||
| warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '" | ||
| warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function " | ||
| warn = warn .. "as expected. Export " .. suppress_var .. "=1" | ||
| if not suppress_warn or suppress_warn == "1" then | ||
| LmodWarning(warn) | ||
| end | ||
| else | ||
| -- driver CUDA versions don't give a patch version for CUDA | ||
| local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") | ||
| local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") | ||
| local driver_libs_need_update = false | ||
| if tonumber(major) < tonumber(major_req) then | ||
| driver_libs_need_update = true | ||
| elseif tonumber(major) == tonumber(major_req) then | ||
| if tonumber(minor) < tonumber(minor_req) then | ||
| local warn = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " | ||
| warn = warn .. "You will therefore be in minor version compatibility mode as described in " | ||
| warn = warn .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n" | ||
| if not suppress_warn or suppress_warn == "1" then | ||
| LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", warn) | ||
| if not suppress_warn then | ||
| pushenv(suppress_var, myModuleName()) | ||
| end | ||
| end | ||
| end | ||
| end | ||
| if driver_libs_need_update == true then | ||
| local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " | ||
| advice = advice .. "Please update your CUDA driver libraries and then " | ||
| advice = advice .. "let EESSI know about the update.\\n" | ||
| advice = advice .. refer_to_docs | ||
| LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) | ||
| end | ||
| end | ||
| if driver_libs_need_update == true then | ||
| local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " | ||
| advice = advice .. "Please update your CUDA driver libraries and then " | ||
| advice = advice .. "let EESSI know about the update.\\n" | ||
| advice = advice .. refer_to_docs | ||
| LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) | ||
| end | ||
| end | ||
| end | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test below checks for
nil