aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp')
-rw-r--r--contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp662
1 files changed, 392 insertions, 270 deletions
diff --git a/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp b/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp
index 769eae14df51..61d12b10dfb6 100644
--- a/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -16,15 +16,17 @@
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/InputInfo.h"
#include "clang/Driver/Options.h"
-#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Host.h"
+#include "llvm/Support/FormatAdapters.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/TargetParser.h"
#include <system_error>
using namespace clang::driver;
@@ -34,25 +36,6 @@ using namespace clang;
using namespace llvm::opt;
namespace {
-struct CudaVersionInfo {
- std::string DetectedVersion;
- CudaVersion Version;
-};
-// Parses the contents of version.txt in an CUDA installation. It should
-// contain one line of the from e.g. "CUDA Version 7.5.2".
-CudaVersionInfo parseCudaVersionFile(llvm::StringRef V) {
- V = V.trim();
- if (!V.startswith("CUDA Version "))
- return {V.str(), CudaVersion::UNKNOWN};
- V = V.substr(strlen("CUDA Version "));
- SmallVector<StringRef,4> VersionParts;
- V.split(VersionParts, '.');
- return {"version.txt: " + V.str() + ".",
- VersionParts.size() < 2
- ? CudaVersion::UNKNOWN
- : CudaStringToVersion(
- join_items(".", VersionParts[0], VersionParts[1]))};
-}
CudaVersion getCudaVersion(uint32_t raw_version) {
if (raw_version < 7050)
@@ -77,15 +60,41 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
return CudaVersion::CUDA_110;
if (raw_version < 11020)
return CudaVersion::CUDA_111;
- return CudaVersion::LATEST;
+ if (raw_version < 11030)
+ return CudaVersion::CUDA_112;
+ if (raw_version < 11040)
+ return CudaVersion::CUDA_113;
+ if (raw_version < 11050)
+ return CudaVersion::CUDA_114;
+ if (raw_version < 11060)
+ return CudaVersion::CUDA_115;
+ if (raw_version < 11070)
+ return CudaVersion::CUDA_116;
+ if (raw_version < 11080)
+ return CudaVersion::CUDA_117;
+ if (raw_version < 11090)
+ return CudaVersion::CUDA_118;
+ if (raw_version < 12010)
+ return CudaVersion::CUDA_120;
+ if (raw_version < 12020)
+ return CudaVersion::CUDA_121;
+ if (raw_version < 12030)
+ return CudaVersion::CUDA_122;
+ if (raw_version < 12040)
+ return CudaVersion::CUDA_123;
+ if (raw_version < 12050)
+ return CudaVersion::CUDA_124;
+ if (raw_version < 12060)
+ return CudaVersion::CUDA_125;
+ return CudaVersion::NEW;
}
-CudaVersionInfo parseCudaHFile(llvm::StringRef Input) {
+CudaVersion parseCudaHFile(llvm::StringRef Input) {
// Helper lambda which skips the words if the line starts with them or returns
- // None otherwise.
+ // std::nullopt otherwise.
auto StartsWithWords =
[](llvm::StringRef Line,
- const SmallVector<StringRef, 3> words) -> llvm::Optional<StringRef> {
+ const SmallVector<StringRef, 3> words) -> std::optional<StringRef> {
for (StringRef word : words) {
if (!Line.consume_front(word))
return {};
@@ -100,21 +109,27 @@ CudaVersionInfo parseCudaHFile(llvm::StringRef Input) {
StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) {
uint32_t RawVersion;
Line->consumeInteger(10, RawVersion);
- return {"cuda.h: CUDA_VERSION=" + Twine(RawVersion).str() + ".",
- getCudaVersion(RawVersion)};
+ return getCudaVersion(RawVersion);
}
// Find next non-empty line.
Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim();
}
- return {"cuda.h: CUDA_VERSION not found.", CudaVersion::UNKNOWN};
+ return CudaVersion::UNKNOWN;
}
} // namespace
void CudaInstallationDetector::WarnIfUnsupportedVersion() {
- if (DetectedVersionIsNotSupported)
- D.Diag(diag::warn_drv_unknown_cuda_version)
- << DetectedVersion
- << CudaVersionToString(CudaVersion::LATEST_SUPPORTED);
+ if (Version > CudaVersion::PARTIALLY_SUPPORTED) {
+ std::string VersionString = CudaVersionToString(Version);
+ if (!VersionString.empty())
+ VersionString.insert(0, " ");
+ D.Diag(diag::warn_drv_new_cuda_version)
+ << VersionString
+ << (CudaVersion::PARTIALLY_SUPPORTED != CudaVersion::FULLY_SUPPORTED)
+ << CudaVersionToString(CudaVersion::PARTIALLY_SUPPORTED);
+ } else if (Version > CudaVersion::FULLY_SUPPORTED)
+ D.Diag(diag::warn_drv_partially_supported_cuda_version)
+ << CudaVersionToString(Version);
}
CudaInstallationDetector::CudaInstallationDetector(
@@ -193,55 +208,28 @@ CudaInstallationDetector::CudaInstallationDetector(
if (CheckLibDevice && !FS.exists(LibDevicePath))
continue;
- // On Linux, we have both lib and lib64 directories, and we need to choose
- // based on our triple. On MacOS, we have only a lib directory.
- //
- // It's sufficient for our purposes to be flexible: If both lib and lib64
- // exist, we choose whichever one matches our triple. Otherwise, if only
- // lib exists, we use it.
- if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
- LibPath = InstallPath + "/lib64";
- else if (FS.exists(InstallPath + "/lib"))
- LibPath = InstallPath + "/lib";
- else
- continue;
-
- CudaVersionInfo VersionInfo = {"", CudaVersion::UNKNOWN};
- if (auto VersionFile = FS.getBufferForFile(InstallPath + "/version.txt"))
- VersionInfo = parseCudaVersionFile((*VersionFile)->getBuffer());
- // If version file didn't give us the version, try to find it in cuda.h
- if (VersionInfo.Version == CudaVersion::UNKNOWN)
- if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h"))
- VersionInfo = parseCudaHFile((*CudaHFile)->getBuffer());
- // As the last resort, make an educated guess between CUDA-7.0, (which had
- // no version.txt file and had old-style libdevice bitcode ) and an unknown
- // recent CUDA version (no version.txt, new style bitcode).
- if (VersionInfo.Version == CudaVersion::UNKNOWN) {
- VersionInfo.Version = (FS.exists(LibDevicePath + "/libdevice.10.bc"))
- ? Version = CudaVersion::LATEST
- : Version = CudaVersion::CUDA_70;
- VersionInfo.DetectedVersion =
- "No version found in version.txt or cuda.h.";
+ Version = CudaVersion::UNKNOWN;
+ if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h"))
+ Version = parseCudaHFile((*CudaHFile)->getBuffer());
+ // As the last resort, make an educated guess between CUDA-7.0, which had
+ // old-style libdevice bitcode, and an unknown recent CUDA version.
+ if (Version == CudaVersion::UNKNOWN) {
+ Version = FS.exists(LibDevicePath + "/libdevice.10.bc")
+ ? CudaVersion::NEW
+ : CudaVersion::CUDA_70;
}
- Version = VersionInfo.Version;
- DetectedVersion = VersionInfo.DetectedVersion;
-
- // TODO(tra): remove the warning once we have all features of 10.2
- // and 11.0 implemented.
- DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED;
-
if (Version >= CudaVersion::CUDA_90) {
// CUDA-9+ uses single libdevice file for all GPU variants.
std::string FilePath = LibDevicePath + "/libdevice.10.bc";
if (FS.exists(FilePath)) {
- for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E;
- ++Arch) {
- CudaArch GpuArch = static_cast<CudaArch>(Arch);
- if (!IsNVIDIAGpuArch(GpuArch))
+ for (int Arch = (int)OffloadArch::SM_30, E = (int)OffloadArch::LAST;
+ Arch < E; ++Arch) {
+ OffloadArch OA = static_cast<OffloadArch>(Arch);
+ if (!IsNVIDIAOffloadArch(OA))
continue;
- std::string GpuArchName(CudaArchToString(GpuArch));
- LibDeviceMap[GpuArchName] = FilePath;
+ std::string OffloadArchName(OffloadArchToString(OA));
+ LibDeviceMap[OffloadArchName] = FilePath;
}
}
} else {
@@ -254,7 +242,7 @@ CudaInstallationDetector::CudaInstallationDetector(
// Process all bitcode filenames that look like
// libdevice.compute_XX.YY.bc
const StringRef LibDeviceName = "libdevice.";
- if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
+ if (!(FileName.starts_with(LibDeviceName) && FileName.ends_with(".bc")))
continue;
StringRef GpuArch = FileName.slice(
LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
@@ -319,24 +307,22 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
return;
}
- CC1Args.push_back("-internal-isystem");
- CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
CC1Args.push_back("-include");
CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
}
void CudaInstallationDetector::CheckCudaVersionSupportsArch(
- CudaArch Arch) const {
- if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
+ OffloadArch Arch) const {
+ if (Arch == OffloadArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
ArchsWithBadVersion[(int)Arch])
return;
- auto MinVersion = MinVersionForCudaArch(Arch);
- auto MaxVersion = MaxVersionForCudaArch(Arch);
+ auto MinVersion = MinVersionForOffloadArch(Arch);
+ auto MaxVersion = MaxVersionForOffloadArch(Arch);
if (Version < MinVersion || Version > MaxVersion) {
ArchsWithBadVersion[(int)Arch] = true;
D.Diag(diag::err_drv_cuda_version_unsupported)
- << CudaArchToString(Arch) << CudaVersionToString(MinVersion)
+ << OffloadArchToString(Arch) << CudaVersionToString(MinVersion)
<< CudaVersionToString(MaxVersion) << InstallPath
<< CudaVersionToString(Version);
}
@@ -395,22 +381,28 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
const ArgList &Args,
const char *LinkingOutput) const {
const auto &TC =
- static_cast<const toolchains::CudaToolChain &>(getToolChain());
+ static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
assert(TC.getTriple().isNVPTX() && "Wrong platform");
StringRef GPUArchName;
- // If this is an OpenMP action we need to extract the device architecture
- // from the -march=arch option. This option may come from -Xopenmp-target
- // flag or the default value.
- if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
- GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
- assert(!GPUArchName.empty() && "Must have an architecture passed in.");
- } else
+ // If this is a CUDA action we need to extract the device architecture
+ // from the Job's associated architecture, otherwise use the -march=arch
+ // option. This option may come from -Xopenmp-target flag or the default
+ // value.
+ if (JA.isDeviceOffloading(Action::OFK_Cuda)) {
GPUArchName = JA.getOffloadingArch();
+ } else {
+ GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
+ if (GPUArchName.empty()) {
+ C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
+ << getToolChain().getArchName() << getShortName();
+ return;
+ }
+ }
// Obtain architecture from the action.
- CudaArch gpu_arch = StringToCudaArch(GPUArchName);
- assert(gpu_arch != CudaArch::UNKNOWN &&
+ OffloadArch gpu_arch = StringToOffloadArch(GPUArchName);
+ assert(gpu_arch != OffloadArch::UNKNOWN &&
"Device action expected to have an architecture.");
// Check that our installation's ptxas supports gpu_arch.
@@ -465,24 +457,33 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-v");
CmdArgs.push_back("--gpu-name");
- CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
+ CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch)));
CmdArgs.push_back("--output-file");
- CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output)));
- for (const auto& II : Inputs)
+ std::string OutputFileName = TC.getInputFilename(Output);
+
+ if (Output.isFilename() && OutputFileName != Output.getFilename())
+ C.addTempFile(Args.MakeArgString(OutputFileName));
+
+ CmdArgs.push_back(Args.MakeArgString(OutputFileName));
+ for (const auto &II : Inputs)
CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
- for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+ for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
CmdArgs.push_back(Args.MakeArgString(A));
- bool Relocatable = false;
+ bool Relocatable;
if (JA.isOffloading(Action::OFK_OpenMP))
// In OpenMP we need to generate relocatable code.
Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
options::OPT_fnoopenmp_relocatable_target,
/*Default=*/true);
else if (JA.isOffloading(Action::OFK_Cuda))
- Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
- options::OPT_fno_gpu_rdc, /*Default=*/false);
+ // In CUDA we generate relocatable code only if -fgpu-rdc is enabled.
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ /*Default=*/false);
+ else
+ // Otherwise, we are compiling directly and should create linkable output.
+ Relocatable = true;
if (Relocatable)
CmdArgs.push_back("-c");
@@ -499,18 +500,20 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
Exec, CmdArgs, Inputs, Output));
}
-static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
- bool includePTX = true;
- for (Arg *A : Args) {
- if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
- A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
- continue;
+static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) {
+ // The new driver does not include PTX by default to avoid overhead.
+ bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false);
+ for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ,
+ options::OPT_no_cuda_include_ptx_EQ)) {
A->claim();
const StringRef ArchStr = A->getValue();
- if (ArchStr == "all" || ArchStr == gpu_arch) {
- includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
- continue;
- }
+ if (A->getOption().matches(options::OPT_cuda_include_ptx_EQ) &&
+ (ArchStr == "all" || ArchStr == InputArch))
+ includePTX = true;
+ else if (A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ) &&
+ (ArchStr == "all" || ArchStr == InputArch))
+ includePTX = false;
}
return includePTX;
}
@@ -518,11 +521,11 @@ static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
// All inputs to this linker must be from CudaDeviceActions, as we need to look
// at the Inputs' Actions in order to figure out which GPU architecture they
// correspond to.
-void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
- const InputInfo &Output,
- const InputInfoList &Inputs,
- const ArgList &Args,
- const char *LinkingOutput) const {
+void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args,
+ const char *LinkingOutput) const {
const auto &TC =
static_cast<const toolchains::CudaToolChain &>(getToolChain());
assert(TC.getTriple().isNVPTX() && "Wrong platform");
@@ -536,14 +539,14 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
CmdArgs.push_back("-g");
- for (const auto& II : Inputs) {
+ for (const auto &II : Inputs) {
auto *A = II.getAction();
assert(A->getInputs().size() == 1 &&
"Device offload action is expected to have a single input");
const char *gpu_arch_str = A->getOffloadingArch();
assert(gpu_arch_str &&
"Device action expected to have associated a GPU architecture!");
- CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+ OffloadArch gpu_arch = StringToOffloadArch(gpu_arch_str);
if (II.getType() == types::TY_PP_Asm &&
!shouldIncludePTX(Args, gpu_arch_str))
@@ -551,13 +554,14 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
// We need to pass an Arch of the form "sm_XX" for cubin files and
// "compute_XX" for ptx.
const char *Arch = (II.getType() == types::TY_PP_Asm)
- ? CudaArchToVirtualArchString(gpu_arch)
+ ? OffloadArchToVirtualArchString(gpu_arch)
: gpu_arch_str;
- CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
- Arch + ",file=" + II.getFilename()));
+ CmdArgs.push_back(
+ Args.MakeArgString(llvm::Twine("--image=profile=") + Arch +
+ ",file=" + getToolChain().getInputFilename(II)));
}
- for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+ for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
CmdArgs.push_back(Args.MakeArgString(A));
const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
@@ -568,114 +572,253 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
Exec, CmdArgs, Inputs, Output));
}
-void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
- const InputInfo &Output,
- const InputInfoList &Inputs,
- const ArgList &Args,
- const char *LinkingOutput) const {
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args,
+ const char *LinkingOutput) const {
const auto &TC =
- static_cast<const toolchains::CudaToolChain &>(getToolChain());
- assert(TC.getTriple().isNVPTX() && "Wrong platform");
-
+ static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
ArgStringList CmdArgs;
- // OpenMP uses nvlink to link cubin files. The result will be embedded in the
- // host binary by the host linker.
- assert(!JA.isHostOffloading(Action::OFK_OpenMP) &&
- "CUDA toolchain not expected for an OpenMP host device.");
+ assert(TC.getTriple().isNVPTX() && "Wrong platform");
+ assert((Output.isFilename() || Output.isNothing()) && "Invalid output.");
if (Output.isFilename()) {
CmdArgs.push_back("-o");
CmdArgs.push_back(Output.getFilename());
- } else
- assert(Output.isNothing() && "Invalid output.");
+ }
+
if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
CmdArgs.push_back("-g");
if (Args.hasArg(options::OPT_v))
CmdArgs.push_back("-v");
- StringRef GPUArch =
- Args.getLastArgValue(options::OPT_march_EQ);
- assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");
+ StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
+ if (GPUArch.empty()) {
+ C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
+ << getToolChain().getArchName() << getShortName();
+ return;
+ }
CmdArgs.push_back("-arch");
CmdArgs.push_back(Args.MakeArgString(GPUArch));
+ if (Args.hasArg(options::OPT_ptxas_path_EQ))
+ CmdArgs.push_back(Args.MakeArgString(
+ "--pxtas-path=" + Args.getLastArgValue(options::OPT_ptxas_path_EQ)));
+
+ if (Args.hasArg(options::OPT_cuda_path_EQ))
+ CmdArgs.push_back(Args.MakeArgString(
+ "--cuda-path=" + Args.getLastArgValue(options::OPT_cuda_path_EQ)));
+
// Add paths specified in LIBRARY_PATH environment variable as -L options.
addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+ // Add standard library search paths passed on the command line.
+ Args.AddAllArgs(CmdArgs, options::OPT_L);
+ getToolChain().AddFilePathLibArgs(Args, CmdArgs);
+ AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
+
+ if (C.getDriver().isUsingLTO())
+ addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
+ C.getDriver().getLTOMode() == LTOK_Thin);
+
// Add paths for the default clang library path.
SmallString<256> DefaultLibPath =
llvm::sys::path::parent_path(TC.getDriver().Dir);
- llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
+ llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
- for (const auto &II : Inputs) {
- if (II.getType() == types::TY_LLVM_IR ||
- II.getType() == types::TY_LTO_IR ||
- II.getType() == types::TY_LTO_BC ||
- II.getType() == types::TY_LLVM_BC) {
- C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
- << getToolChain().getTripleString();
- continue;
- }
-
- // Currently, we only pass the input files to the linker, we do not pass
- // any libraries that may be valid only for the host.
- if (!II.isFilename())
- continue;
-
- const char *CubinF = C.addTempFile(
- C.getArgs().MakeArgString(getToolChain().getInputFilename(II)));
-
- CmdArgs.push_back(CubinF);
- }
-
- const char *Exec =
- Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
C.addCommand(std::make_unique<Command>(
JA, *this,
ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
"--options-file"},
- Exec, CmdArgs, Inputs, Output));
+ Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")),
+ CmdArgs, Inputs, Output));
}
-/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
-/// which isn't properly a linker but nonetheless performs the step of stitching
-/// together object files from the assembler into a single blob.
+void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
+ const llvm::opt::ArgList &Args,
+ std::vector<StringRef> &Features) {
+ if (Args.hasArg(options::OPT_cuda_feature_EQ)) {
+ StringRef PtxFeature =
+ Args.getLastArgValue(options::OPT_cuda_feature_EQ, "+ptx42");
+ Features.push_back(Args.MakeArgString(PtxFeature));
+ return;
+ }
+ CudaInstallationDetector CudaInstallation(D, Triple, Args);
-CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
- const ToolChain &HostTC, const ArgList &Args,
- const Action::OffloadKind OK)
- : ToolChain(D, Triple, Args), HostTC(HostTC),
- CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
- if (CudaInstallation.isValid()) {
- CudaInstallation.WarnIfUnsupportedVersion();
- getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
+ // New CUDA versions often introduce new instructions that are only supported
+ // by new PTX version, so we need to raise PTX level to enable them in NVPTX
+ // back-end.
+ const char *PtxFeature = nullptr;
+ switch (CudaInstallation.version()) {
+#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
+ case CudaVersion::CUDA_##CUDA_VER: \
+ PtxFeature = "+ptx" #PTX_VER; \
+ break;
+ CASE_CUDA_VERSION(125, 85);
+ CASE_CUDA_VERSION(124, 84);
+ CASE_CUDA_VERSION(123, 83);
+ CASE_CUDA_VERSION(122, 82);
+ CASE_CUDA_VERSION(121, 81);
+ CASE_CUDA_VERSION(120, 80);
+ CASE_CUDA_VERSION(118, 78);
+ CASE_CUDA_VERSION(117, 77);
+ CASE_CUDA_VERSION(116, 76);
+ CASE_CUDA_VERSION(115, 75);
+ CASE_CUDA_VERSION(114, 74);
+ CASE_CUDA_VERSION(113, 73);
+ CASE_CUDA_VERSION(112, 72);
+ CASE_CUDA_VERSION(111, 71);
+ CASE_CUDA_VERSION(110, 70);
+ CASE_CUDA_VERSION(102, 65);
+ CASE_CUDA_VERSION(101, 64);
+ CASE_CUDA_VERSION(100, 63);
+ CASE_CUDA_VERSION(92, 61);
+ CASE_CUDA_VERSION(91, 61);
+ CASE_CUDA_VERSION(90, 60);
+#undef CASE_CUDA_VERSION
+ default:
+ PtxFeature = "+ptx42";
}
+ Features.push_back(PtxFeature);
+}
+
+/// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
+/// operates as a stand-alone version of the NVPTX tools without the host
+/// toolchain.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+ const llvm::Triple &HostTriple,
+ const ArgList &Args, bool Freestanding = false)
+ : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args),
+ Freestanding(Freestanding) {
+ if (CudaInstallation.isValid())
+ getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
// Lookup binaries into the driver directory, this is used to
- // discover the clang-offload-bundler executable.
+ // discover the 'nvptx-arch' executable.
getProgramPaths().push_back(getDriver().Dir);
}
-std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
- // Only object files are changed, for example assembly files keep their .s
- // extensions. CUDA also continues to use .o as they don't use nvlink but
- // fatbinary.
- if (!(OK == Action::OFK_OpenMP && Input.getType() == types::TY_Object))
- return ToolChain::getInputFilename(Input);
+/// We only need the host triple to locate the CUDA binary utilities, use the
+/// system's default triple if not provided.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+ const ArgList &Args)
+ : NVPTXToolChain(D, Triple, llvm::Triple(LLVM_HOST_TRIPLE), Args,
+ /*Freestanding=*/true) {}
+
+llvm::opt::DerivedArgList *
+NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+ StringRef BoundArch,
+ Action::OffloadKind OffloadKind) const {
+ DerivedArgList *DAL = ToolChain::TranslateArgs(Args, BoundArch, OffloadKind);
+ if (!DAL)
+ DAL = new DerivedArgList(Args.getBaseArgs());
+
+ const OptTable &Opts = getDriver().getOpts();
+
+ for (Arg *A : Args)
+ if (!llvm::is_contained(*DAL, A))
+ DAL->append(A);
+
+ if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+ OffloadArchToString(OffloadArch::CudaDefault));
+ } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "generic" &&
+ OffloadKind == Action::OFK_None) {
+ DAL->eraseArg(options::OPT_march_EQ);
+ } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {
+ auto GPUsOrErr = getSystemGPUArchs(Args);
+ if (!GPUsOrErr) {
+ getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
+ << getArchName() << llvm::toString(GPUsOrErr.takeError()) << "-march";
+ } else {
+ if (GPUsOrErr->size() > 1)
+ getDriver().Diag(diag::warn_drv_multi_gpu_arch)
+ << getArchName() << llvm::join(*GPUsOrErr, ", ") << "-march";
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+ Args.MakeArgString(GPUsOrErr->front()));
+ }
+ }
- // Replace extension for object files with cubin because nvlink relies on
- // these particular file names.
- SmallString<256> Filename(ToolChain::getInputFilename(Input));
- llvm::sys::path::replace_extension(Filename, "cubin");
- return std::string(Filename.str());
+ return DAL;
+}
+
+void NVPTXToolChain::addClangTargetOptions(
+ const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+ Action::OffloadKind DeviceOffloadingKind) const {
+ // If we are compiling with a standalone NVPTX toolchain we want to try to
+ // mimic a standard environment as much as possible. So we enable lowering
+ // ctor / dtor functions to global symbols that can be registered.
+ if (Freestanding)
+ CC1Args.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"});
+}
+
+bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
+ const Option &O = A->getOption();
+ return (O.matches(options::OPT_gN_Group) &&
+ !O.matches(options::OPT_gmodules)) ||
+ O.matches(options::OPT_g_Flag) ||
+ O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
+ O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
+ O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
+ O.matches(options::OPT_gdwarf_5) ||
+ O.matches(options::OPT_gcolumn_info);
+}
+
+void NVPTXToolChain::adjustDebugInfoKind(
+ llvm::codegenoptions::DebugInfoKind &DebugInfoKind,
+ const ArgList &Args) const {
+ switch (mustEmitDebugInfo(Args)) {
+ case DisableDebugInfo:
+ DebugInfoKind = llvm::codegenoptions::NoDebugInfo;
+ break;
+ case DebugDirectivesOnly:
+ DebugInfoKind = llvm::codegenoptions::DebugDirectivesOnly;
+ break;
+ case EmitSameDebugInfoAsHost:
+ // Use same debug info level as the host.
+ break;
+ }
}
+Expected<SmallVector<std::string>>
+NVPTXToolChain::getSystemGPUArchs(const ArgList &Args) const {
+ // Detect NVIDIA GPUs available on the system.
+ std::string Program;
+ if (Arg *A = Args.getLastArg(options::OPT_nvptx_arch_tool_EQ))
+ Program = A->getValue();
+ else
+ Program = GetProgramPath("nvptx-arch");
+
+ auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);
+ if (!StdoutOrErr)
+ return StdoutOrErr.takeError();
+
+ SmallVector<std::string, 1> GPUArchs;
+ for (StringRef Arch : llvm::split((*StdoutOrErr)->getBuffer(), "\n"))
+ if (!Arch.empty())
+ GPUArchs.push_back(Arch.str());
+
+ if (GPUArchs.empty())
+ return llvm::createStringError(std::error_code(),
+ "No NVIDIA GPU detected in the system");
+
+ return std::move(GPUArchs);
+}
+
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
+
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+ const ToolChain &HostTC, const ArgList &Args)
+ : NVPTXToolChain(D, Triple, HostTC.getTriple(), Args), HostTC(HostTC) {}
+
void CudaToolChain::addClangTargetOptions(
- const llvm::opt::ArgList &DriverArgs,
- llvm::opt::ArgStringList &CC1Args,
+ const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadingKind) const {
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
@@ -686,11 +829,15 @@ void CudaToolChain::addClangTargetOptions(
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
if (DeviceOffloadingKind == Action::OFK_Cuda) {
- CC1Args.push_back("-fcuda-is-device");
-
- if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
- options::OPT_fno_cuda_approx_transcendentals, false))
- CC1Args.push_back("-fcuda-approx-transcendentals");
+ CC1Args.append(
+ {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
+
+ // Unsized function arguments used for variadics were introduced in CUDA-9.0.
+ // We still do not support generating code that actually uses variadic
+ // arguments yet, but we do need to allow parsing them as recent CUDA
+ // headers rely on that. https://github.com/llvm/llvm-project/issues/58410
+ if (CudaInstallation.version() >= CudaVersion::CUDA_90)
+ CC1Args.push_back("-fcuda-allow-variadic-functions");
}
if (DriverArgs.hasArg(options::OPT_nogpulib))
@@ -711,29 +858,6 @@ void CudaToolChain::addClangTargetOptions(
clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
- // New CUDA versions often introduce new instructions that are only supported
- // by new PTX version, so we need to raise PTX level to enable them in NVPTX
- // back-end.
- const char *PtxFeature = nullptr;
- switch (CudaInstallationVersion) {
-#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
- case CudaVersion::CUDA_##CUDA_VER: \
- PtxFeature = "+ptx" #PTX_VER; \
- break;
- CASE_CUDA_VERSION(112, 72);
- CASE_CUDA_VERSION(111, 71);
- CASE_CUDA_VERSION(110, 70);
- CASE_CUDA_VERSION(102, 65);
- CASE_CUDA_VERSION(101, 64);
- CASE_CUDA_VERSION(100, 63);
- CASE_CUDA_VERSION(92, 61);
- CASE_CUDA_VERSION(91, 61);
- CASE_CUDA_VERSION(90, 60);
-#undef CASE_CUDA_VERSION
- default:
- PtxFeature = "+ptx42";
- }
- CC1Args.append({"-target-feature", PtxFeature});
if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
options::OPT_fno_cuda_short_ptr, false))
CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
@@ -751,15 +875,12 @@ void CudaToolChain::addClangTargetOptions(
return;
}
- std::string BitcodeSuffix;
- if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
- options::OPT_fno_openmp_target_new_runtime, false))
- BitcodeSuffix = "new-nvptx-" + GpuArch.str();
- else
- BitcodeSuffix = "nvptx-" + GpuArch.str();
+ // Link the bitcode library late if we're using device LTO.
+ if (getDriver().isUsingLTO(/* IsOffload */ true))
+ return;
- addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
- getTriple());
+ addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
+ getTriple(), HostTC);
}
}
@@ -777,33 +898,6 @@ llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
return llvm::DenormalMode::getIEEE();
}
-bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
- const Option &O = A->getOption();
- return (O.matches(options::OPT_gN_Group) &&
- !O.matches(options::OPT_gmodules)) ||
- O.matches(options::OPT_g_Flag) ||
- O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
- O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
- O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
- O.matches(options::OPT_gdwarf_5) ||
- O.matches(options::OPT_gcolumn_info);
-}
-
-void CudaToolChain::adjustDebugInfoKind(
- codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
- switch (mustEmitDebugInfo(Args)) {
- case DisableDebugInfo:
- DebugInfoKind = codegenoptions::NoDebugInfo;
- break;
- case DebugDirectivesOnly:
- DebugInfoKind = codegenoptions::DebugDirectivesOnly;
- break;
- case EmitSameDebugInfoAsHost:
- // Use same debug info level as the host.
- break;
- }
-}
-
void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {
// Check our CUDA version if we're going to include the CUDA headers.
@@ -811,11 +905,20 @@ void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
!DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
assert(!Arch.empty() && "Must have an explicit GPU arch.");
- CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
+ CudaInstallation.CheckCudaVersionSupportsArch(StringToOffloadArch(Arch));
}
CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
}
+std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
+ // Returns the file name to use for Input. This tool chain does not rename
+ // any input: object files, like every other input type, keep the name
+ // reported by the base ToolChain, both for regular and for device-only
+ // compilation. The override therefore forwards unconditionally instead of
+ // branching between two identical calls.
+ return ToolChain::getInputFilename(Input);
+}
+
llvm::opt::DerivedArgList *
CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
StringRef BoundArch,
@@ -831,45 +934,59 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
// flags are not duplicated.
// Also append the compute capability.
if (DeviceOffloadKind == Action::OFK_OpenMP) {
- for (Arg *A : Args) {
- bool IsDuplicate = false;
- for (Arg *DALArg : *DAL) {
- if (A == DALArg) {
- IsDuplicate = true;
- break;
+ for (Arg *A : Args)
+ if (!llvm::is_contained(*DAL, A))
+ DAL->append(A);
+
+ if (!DAL->hasArg(options::OPT_march_EQ)) {
+ StringRef Arch = BoundArch;
+ if (Arch.empty()) {
+ auto ArchsOrErr = getSystemGPUArchs(Args);
+ if (!ArchsOrErr) {
+ std::string ErrMsg =
+ llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError()));
+ getDriver().Diag(diag::err_drv_undetermined_gpu_arch)
+ << llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march";
+ Arch = OffloadArchToString(OffloadArch::CudaDefault);
+ } else {
+ Arch = Args.MakeArgString(ArchsOrErr->front());
}
}
- if (!IsDuplicate)
- DAL->append(A);
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);
}
- StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
- if (Arch.empty())
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
- CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
-
return DAL;
}
for (Arg *A : Args) {
- DAL->append(A);
+ // Make sure flags are not duplicated.
+ if (!llvm::is_contained(*DAL, A)) {
+ DAL->append(A);
+ }
}
if (!BoundArch.empty()) {
DAL->eraseArg(options::OPT_march_EQ);
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+ BoundArch);
}
return DAL;
}
+Tool *NVPTXToolChain::buildAssembler() const {
+ return new tools::NVPTX::Assembler(*this); // New tool constructed from *this.
+}
+
+Tool *NVPTXToolChain::buildLinker() const {
+ return new tools::NVPTX::Linker(*this); // New tool constructed from *this.
+}
+
Tool *CudaToolChain::buildAssembler() const {
return new tools::NVPTX::Assembler(*this);
}
Tool *CudaToolChain::buildLinker() const {
- if (OK == Action::OFK_OpenMP)
- return new tools::NVPTX::OpenMPLinker(*this);
- return new tools::NVPTX::Linker(*this);
+ return new tools::NVPTX::FatBinary(*this); // Always the FatBinary tool.
}
void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
@@ -884,6 +1001,11 @@ CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
ArgStringList &CC1Args) const {
HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
+ // Add the CUDA include path unless nogpuinc is set or the install is invalid.
+ if (!DriverArgs.hasArg(options::OPT_nogpuinc) && CudaInstallation.isValid())
+ CC1Args.append(
+ {"-internal-isystem",
+ DriverArgs.MakeArgString(CudaInstallation.getIncludePath())});
}
void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,