aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp')
-rw-r--r--contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp328
1 files changed, 206 insertions, 122 deletions
diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
index 88030fee501b..5b43272bfa62 100644
--- a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
@@ -19,11 +19,13 @@
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/VirtualFileSystem.h"
using namespace clang;
using namespace CodeGen;
@@ -37,7 +39,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
- llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
+ llvm::PointerType *PtrTy;
/// Convenience reference to LLVM Context
llvm::LLVMContext &Context;
@@ -49,10 +51,10 @@ private:
const Decl *D;
};
llvm::SmallVector<KernelInfo, 16> EmittedKernels;
- // Map a device stub function to a symbol for identifying kernel in host code.
+ // Map a kernel mangled name to a symbol for identifying kernel in host code
// For CUDA, the symbol for identifying the kernel is the same as the device
// stub function. For HIP, they are different.
- llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+ llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
// Map a kernel handle to the kernel stub.
llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
struct VarInfo {
@@ -69,6 +71,8 @@ private:
bool RelocatableDeviceCode;
/// Mangle context for device.
std::unique_ptr<MangleContext> DeviceMC;
+ /// Some zeros used for GEPs.
+ llvm::Constant *Zeros[2];
llvm::FunctionCallee getSetupArgumentFn() const;
llvm::FunctionCallee getLaunchFn() const;
@@ -86,14 +90,25 @@ private:
/// the start of the string. The result of this function can be used anywhere
/// where the C code specifies const char*.
llvm::Constant *makeConstantString(const std::string &Str,
- const std::string &Name = "",
- const std::string &SectionName = "",
- unsigned Alignment = 0) {
- llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
- llvm::ConstantInt::get(SizeTy, 0)};
+ const std::string &Name = "") {
auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
- llvm::GlobalVariable *GV =
- cast<llvm::GlobalVariable>(ConstStr.getPointer());
+ return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
+ ConstStr.getPointer(), Zeros);
+ }
+
+ /// Helper function which generates an initialized constant array from Str,
+ /// and optionally sets section name and alignment. AddNull specifies whether
+ /// the array should nave NUL termination.
+ llvm::Constant *makeConstantArray(StringRef Str,
+ StringRef Name = "",
+ StringRef SectionName = "",
+ unsigned Alignment = 0,
+ bool AddNull = false) {
+ llvm::Constant *Value =
+ llvm::ConstantDataArray::getString(Context, Str, AddNull);
+ auto *GV = new llvm::GlobalVariable(
+ TheModule, Value->getType(), /*isConstant=*/true,
+ llvm::GlobalValue::PrivateLinkage, Value, Name);
if (!SectionName.empty()) {
GV->setSection(SectionName);
// Mark the address as used which make sure that this section isn't
@@ -102,9 +117,7 @@ private:
}
if (Alignment)
GV->setAlignment(llvm::Align(Alignment));
-
- return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
- ConstStr.getPointer(), Zeros);
+ return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
}
/// Helper function that generates an empty dummy function returning void.
@@ -157,6 +170,8 @@ private:
llvm::Function *makeModuleDtorFunction();
/// Transform managed variables for device compilation.
void transformManagedVars();
+ /// Create offloading entries to register globals in RDC mode.
+ void createOffloadingEntries();
public:
CGNVCUDARuntime(CodeGenModule &CGM);
@@ -177,7 +192,7 @@ public:
llvm::Function *finalizeModule() override;
};
-}
+} // end anonymous namespace
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
if (CGM.getLangOpts().HIP)
@@ -212,21 +227,17 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
TheModule(CGM.getModule()),
RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
DeviceMC(InitDeviceMC(CGM)) {
- CodeGen::CodeGenTypes &Types = CGM.getTypes();
- ASTContext &Ctx = CGM.getContext();
-
IntTy = CGM.IntTy;
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
-
- CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
- VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
- VoidPtrPtrTy = VoidPtrTy->getPointerTo();
+ Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
+ Zeros[1] = Zeros[0];
+ PtrTy = CGM.UnqualPtrTy;
}
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
// cudaError_t cudaSetupArgument(void *, size_t, size_t)
- llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
+ llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
return CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, Params, false),
addPrefixToName("SetupArgument"));
@@ -236,27 +247,24 @@ llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
if (CGM.getLangOpts().HIP) {
// hipError_t hipLaunchByPtr(char *);
return CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
- } else {
- // cudaError_t cudaLaunch(char *);
- return CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+ llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
}
+ // cudaError_t cudaLaunch(char *);
+ return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
+ "cudaLaunch");
}
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
- return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
+ return llvm::FunctionType::get(VoidTy, PtrTy, false);
}
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
- return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+ return llvm::FunctionType::get(VoidTy, PtrTy, false);
}
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
- auto CallbackFnTy = getCallbackFnTy();
- auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
- llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
- VoidPtrTy, CallbackFnTy->getPointerTo()};
+ llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
+ llvm::PointerType::getUnqual(Context)};
return llvm::FunctionType::get(VoidTy, Params, false);
}
@@ -282,13 +290,12 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
DeviceSideName = std::string(ND->getIdentifier()->getName());
// Make unique name for device side static file-scope variable for HIP.
- if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
- CGM.getLangOpts().GPURelocatableDeviceCode &&
- !CGM.getLangOpts().CUID.empty()) {
+ if (CGM.getContext().shouldExternalize(ND) &&
+ CGM.getLangOpts().GPURelocatableDeviceCode) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << DeviceSideName;
- CGM.printPostfixForExternalizedStaticVar(Out);
+ CGM.printPostfixForExternalizedDecl(Out, ND);
DeviceSideName = std::string(Out.str());
}
return DeviceSideName;
@@ -297,7 +304,8 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
- if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+ if (auto *GV =
+ dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
GV->setLinkage(CGF.CurFn->getLinkage());
GV->setInitializer(CGF.CurFn);
}
@@ -319,29 +327,40 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
Address KernelArgs = CGF.CreateTempAlloca(
- VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+ PtrTy, CharUnits::fromQuantity(16), "kernel_args",
llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
// Store pointers to the arguments in a locally allocated launch_args.
for (unsigned i = 0; i < Args.size(); ++i) {
llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
- llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+ llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
CGF.Builder.CreateDefaultAlignedStore(
VoidVarPtr,
- CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
+ CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i));
}
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
// Lookup cudaLaunchKernel/hipLaunchKernel function.
+ // HIP kernel launching API name depends on -fgpu-default-stream option. For
+ // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
+ // it is hipLaunchKernel_spt.
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
// void **args, size_t sharedMem,
// cudaStream_t stream);
- // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
- // void **args, size_t sharedMem,
- // hipStream_t stream);
+ // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+ // dim3 blockDim, void **args,
+ // size_t sharedMem, hipStream_t stream);
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
- auto LaunchKernelName = addPrefixToName("LaunchKernel");
+ std::string KernelLaunchAPI = "LaunchKernel";
+ if (CGF.getLangOpts().GPUDefaultStream ==
+ LangOptions::GPUDefaultStreamKind::PerThread) {
+ if (CGF.getLangOpts().HIP)
+ KernelLaunchAPI = KernelLaunchAPI + "_spt";
+ else if (CGF.getLangOpts().CUDA)
+ KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
+ }
+ auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
IdentifierInfo &cudaLaunchKernelII =
CGM.getContext().Idents.get(LaunchKernelName);
FunctionDecl *cudaLaunchKernelFD = nullptr;
@@ -364,8 +383,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
Address ShmemSize =
CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
- Address Stream =
- CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+ Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy,
{/*gridDim=*/GridDim.getType(),
@@ -381,7 +399,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
// Emit the call to cudaLaunch
llvm::Value *Kernel =
- CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
+ CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
CallArgList LaunchKernelArgs;
LaunchKernelArgs.add(RValue::get(Kernel),
cudaLaunchKernelFD->getParamDecl(0)->getType());
@@ -397,7 +415,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
QualType QT = cudaLaunchKernelFD->getType();
QualType CQT = QT.getCanonicalType();
llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
- llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+ llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
const CGFunctionInfo &FI =
CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
@@ -421,7 +439,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
Offset = Offset.alignTo(TInfo.Align);
llvm::Value *Args[] = {
CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
- VoidPtrTy),
+ PtrTy),
llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
};
@@ -437,7 +455,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
// Emit the call to cudaLaunch
llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
llvm::Value *Arg =
- CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
+ CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
CGF.EmitBranch(EndBlock);
@@ -473,7 +491,7 @@ static void replaceManagedVar(llvm::GlobalVariable *Var,
// variable with instructions.
for (auto &&Op : WorkItem) {
auto *CE = cast<llvm::ConstantExpr>(Op);
- auto *NewInst = llvm::createReplacementInstr(CE, I);
+ auto *NewInst = CE->getAsInstruction(I);
NewInst->replaceUsesOfWith(OldV, NewV);
OldV = CE;
NewV = NewInst;
@@ -515,8 +533,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// void __cudaRegisterFunction(void **, const char *, char *, const char *,
// int, uint3*, uint3*, dim3*, dim3*, int*)
llvm::Type *RegisterFuncParams[] = {
- VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
- VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
+ PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
+ PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
addUnderscoredPrefixToName("RegisterFunction"));
@@ -528,10 +546,10 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
for (auto &&I : EmittedKernels) {
llvm::Constant *KernelName =
makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
- llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
+ llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
- Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
+ KernelHandles[I.Kernel->getName()],
KernelName,
KernelName,
llvm::ConstantInt::get(IntTy, -1),
@@ -539,7 +557,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
NullPtr,
NullPtr,
NullPtr,
- llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
+ llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
Builder.CreateCall(RegisterFunc, Args);
}
@@ -551,16 +569,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// void __cudaRegisterVar(void **, char *, char *, const char *,
// int, int, int, int)
- llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
- CharPtrTy, IntTy, VarSizeTy,
- IntTy, IntTy};
+ llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
+ IntTy, VarSizeTy, IntTy, IntTy};
llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
addUnderscoredPrefixToName("RegisterVar"));
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
// size_t, unsigned)
- llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
- CharPtrTy, VarSizeTy, IntTy};
+ llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
+ PtrTy, VarSizeTy, IntTy};
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
addUnderscoredPrefixToName("RegisterManagedVar"));
@@ -568,16 +585,13 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// const void **, const char *, int, int);
llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(
- VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
- false),
+ VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
addUnderscoredPrefixToName("RegisterSurface"));
// void __cudaRegisterTexture(void **, const struct textureReference *,
// const void **, const char *, int, int, int)
llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(
- VoidTy,
- {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
- false),
+ VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
addUnderscoredPrefixToName("RegisterTexture"));
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
@@ -590,7 +604,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
uint64_t VarSize =
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
if (Info.Flags.isManaged()) {
- auto ManagedVar = new llvm::GlobalVariable(
+ auto *ManagedVar = new llvm::GlobalVariable(
CGM.getModule(), Var->getType(),
/*isConstant=*/false, Var->getLinkage(),
/*Init=*/Var->isDeclaration()
@@ -606,8 +620,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
replaceManagedVar(Var, ManagedVar);
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
- Builder.CreateBitCast(ManagedVar, VoidPtrTy),
- Builder.CreateBitCast(Var, VoidPtrTy),
+ ManagedVar,
+ Var,
VarName,
llvm::ConstantInt::get(VarSizeTy, VarSize),
llvm::ConstantInt::get(IntTy, Var->getAlignment())};
@@ -616,7 +630,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
} else {
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
- Builder.CreateBitCast(Var, VoidPtrTy),
+ Var,
VarName,
VarName,
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
@@ -630,15 +644,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
case DeviceVarFlags::Surface:
Builder.CreateCall(
RegisterSurf,
- {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
- VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
+ {&GpuBinaryHandlePtr, Var, VarName, VarName,
+ llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
break;
case DeviceVarFlags::Texture:
Builder.CreateCall(
RegisterTex,
- {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
- VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
+ {&GpuBinaryHandlePtr, Var, VarName, VarName,
+ llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
break;
@@ -653,7 +667,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
///
/// For CUDA:
/// \code
-/// void __cuda_module_ctor(void*) {
+/// void __cuda_module_ctor() {
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
/// __cuda_register_globals(Handle);
/// }
@@ -661,7 +675,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
///
/// For HIP:
/// \code
-/// void __hip_module_ctor(void*) {
+/// void __hip_module_ctor() {
/// if (__hip_gpubin_handle == 0) {
/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
/// __hip_register_globals(__hip_gpubin_handle);
@@ -688,11 +702,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// void ** __{cuda|hip}RegisterFatBinary(void *);
llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
+ llvm::FunctionType::get(PtrTy, PtrTy, false),
addUnderscoredPrefixToName("RegisterFatBinary"));
// struct { int magic, int version, void * gpu_binary, void * dont_care };
llvm::StructType *FatbinWrapperTy =
- llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
+ llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
// Register GPU binary with the CUDA runtime, store returned handle in a
// global variable and save a reference in GpuBinaryHandle to be cleaned up
@@ -700,8 +714,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// handle so CUDA runtime can figure out what to call on the GPU side.
std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
if (!CudaGpuBinaryFileName.empty()) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
- llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+ auto VFS = CGM.getFileSystem();
+ auto CudaGpuBinaryOrErr =
+ VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
CGM.getDiags().Report(diag::err_cannot_open_file)
<< CudaGpuBinaryFileName << EC.message();
@@ -711,7 +726,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
}
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
- llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+ llvm::FunctionType::get(VoidTy, false),
llvm::GlobalValue::InternalLinkage,
addUnderscoredPrefixToName("_module_ctor"), &TheModule);
llvm::BasicBlock *CtorEntryBB =
@@ -737,9 +752,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// If fatbin is available from early finalization, create a string
// literal containing the fat binary loaded from the given file.
const unsigned HIPCodeObjectAlign = 4096;
- FatBinStr =
- makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
- FatbinConstantName, HIPCodeObjectAlign);
+ FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
+ FatbinConstantName, HIPCodeObjectAlign);
} else {
// If fatbin is not available, create an external symbol
// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
@@ -773,8 +787,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// For CUDA, create a string literal containing the fat binary loaded from
// the given file.
- FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
- FatbinConstantName, 8);
+ FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
+ FatbinConstantName, 8);
FatMagic = CudaFatMagic;
}
@@ -788,7 +802,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// Data.
Values.add(FatBinStr);
// Unused in fatbin v1.
- Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
+ Values.add(llvm::ConstantPointerNull::get(PtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
/*constant*/ true);
@@ -811,19 +825,21 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// The name, size, and initialization pattern of this variable is part
// of HIP ABI.
GpuBinaryHandle = new llvm::GlobalVariable(
- TheModule, VoidPtrPtrTy, /*isConstant=*/false,
- Linkage,
- /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+ TheModule, PtrTy, /*isConstant=*/false, Linkage,
+ /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
"__hip_gpubin_handle");
+ if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
+ GpuBinaryHandle->setComdat(
+ CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
// Prevent the weak symbol in different shared libraries being merged.
if (Linkage != llvm::GlobalValue::InternalLinkage)
GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
Address GpuBinaryAddr(
- GpuBinaryHandle,
+ GpuBinaryHandle, PtrTy,
CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
{
- auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+ auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
llvm::Constant *Zero =
llvm::Constant::getNullValue(HandleValue->getType());
llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
@@ -832,9 +848,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
{
CtorBuilder.SetInsertPoint(IfBlock);
// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
- llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
- RegisterFatbinFunc,
- CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+ llvm::CallInst *RegisterFatbinCall =
+ CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
CtorBuilder.CreateBr(ExitBlock);
}
@@ -842,7 +857,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CtorBuilder.SetInsertPoint(ExitBlock);
// Call __hip_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc) {
- auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+ auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
}
}
@@ -850,12 +865,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// Register binary with CUDA runtime. This is substantially different in
// default mode vs. separate compilation!
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
- llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
- RegisterFatbinFunc,
- CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+ llvm::CallInst *RegisterFatbinCall =
+ CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
GpuBinaryHandle = new llvm::GlobalVariable(
- TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
- llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+ TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
+ llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());
@@ -869,7 +883,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
// void __cudaRegisterFatBinaryEnd(void **);
llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+ llvm::FunctionType::get(VoidTy, PtrTy, false),
"__cudaRegisterFatBinaryEnd");
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
}
@@ -878,8 +892,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
SmallString<64> ModuleID;
llvm::raw_svector_ostream OS(ModuleID);
OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
- llvm::Constant *ModuleIDConstant = makeConstantString(
- std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
+ llvm::Constant *ModuleIDConstant = makeConstantArray(
+ std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);
// Create an alias for the FatbinWrapper that nvcc will look for.
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
@@ -893,9 +907,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
- llvm::Value *Args[] = {RegisterGlobalsFunc,
- CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
- ModuleIDConstant,
+ llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
makeDummyFunction(getCallbackFnTy())};
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
}
@@ -922,14 +934,14 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
///
/// For CUDA:
/// \code
-/// void __cuda_module_dtor(void*) {
+/// void __cuda_module_dtor() {
/// __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
-/// void __hip_module_dtor(void*) {
+/// void __hip_module_dtor() {
/// if (__hip_gpubin_handle) {
/// __hipUnregisterFatBinary(__hip_gpubin_handle);
/// __hip_gpubin_handle = 0;
@@ -943,11 +955,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// void __cudaUnregisterFatBinary(void ** handle);
llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+ llvm::FunctionType::get(VoidTy, PtrTy, false),
addUnderscoredPrefixToName("UnregisterFatBinary"));
llvm::Function *ModuleDtorFunc = llvm::Function::Create(
- llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+ llvm::FunctionType::get(VoidTy, false),
llvm::GlobalValue::InternalLinkage,
addUnderscoredPrefixToName("_module_dtor"), &TheModule);
@@ -956,9 +968,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);
- Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
- GpuBinaryHandle->getAlignment()));
- auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
+ Address GpuBinaryAddr(
+ GpuBinaryHandle, GpuBinaryHandle->getValueType(),
+ CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
+ auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// There is only one HIP fat binary per linked module, however there are
// multiple destructor functions. Make sure the fat binary is unregistered
// only once.
@@ -1071,7 +1084,7 @@ void CGNVCUDARuntime::transformManagedVars() {
llvm::GlobalVariable *Var = Info.Var;
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
Info.Flags.isManaged()) {
- auto ManagedVar = new llvm::GlobalVariable(
+ auto *ManagedVar = new llvm::GlobalVariable(
CGM.getModule(), Var->getType(),
/*isConstant=*/false, Var->getLinkage(),
/*Init=*/Var->isDeclaration()
@@ -1097,6 +1110,53 @@ void CGNVCUDARuntime::transformManagedVars() {
}
}
+// Creates offloading entries for all the kernels and globals that must be
+// registered. The linker will provide a pointer to this section so we can
+// register the symbols with the linked device image.
+void CGNVCUDARuntime::createOffloadingEntries() {
+ StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
+ : "cuda_offloading_entries";
+ llvm::Module &M = CGM.getModule();
+ for (KernelInfo &I : EmittedKernels)
+ llvm::offloading::emitOffloadingEntry(
+ M, KernelHandles[I.Kernel->getName()],
+ getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
+ llvm::offloading::OffloadGlobalEntry, Section);
+
+ for (VarInfo &I : DeviceVars) {
+ uint64_t VarSize =
+ CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
+ int32_t Flags =
+ (I.Flags.isExtern()
+ ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
+ : 0) |
+ (I.Flags.isConstant()
+ ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
+ : 0) |
+ (I.Flags.isNormalized()
+ ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
+ : 0);
+ if (I.Flags.getKind() == DeviceVarFlags::Variable) {
+ llvm::offloading::emitOffloadingEntry(
+ M, I.Var, getDeviceSideName(I.D), VarSize,
+ (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
+ : llvm::offloading::OffloadGlobalEntry) |
+ Flags,
+ /*Data=*/0, Section);
+ } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
+ llvm::offloading::emitOffloadingEntry(
+ M, I.Var, getDeviceSideName(I.D), VarSize,
+ llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
+ I.Flags.getSurfTexType(), Section);
+ } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
+ llvm::offloading::emitOffloadingEntry(
+ M, I.Var, getDeviceSideName(I.D), VarSize,
+ llvm::offloading::OffloadGlobalTextureEntry | Flags,
+ I.Flags.getSurfTexType(), Section);
+ }
+ }
+}
+
// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
if (CGM.getLangOpts().CUDAIsDevice) {
@@ -1125,17 +1185,37 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
- return makeModuleCtorFunction();
+ if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+ createOffloadingEntries();
+ else
+ return makeModuleCtorFunction();
+
+ return nullptr;
}
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
GlobalDecl GD) {
- auto Loc = KernelHandles.find(F);
- if (Loc != KernelHandles.end())
- return Loc->second;
+ auto Loc = KernelHandles.find(F->getName());
+ if (Loc != KernelHandles.end()) {
+ auto OldHandle = Loc->second;
+ if (KernelStubs[OldHandle] == F)
+ return OldHandle;
+
+ // We've found the function name, but F itself has changed, so we need to
+ // update the references.
+ if (CGM.getLangOpts().HIP) {
+ // For HIP compilation the handle itself does not change, so we only need
+ // to update the Stub value.
+ KernelStubs[OldHandle] = F;
+ return OldHandle;
+ }
+ // For non-HIP compilation, erase the old Stub and fall-through to creating
+ // new entries.
+ KernelStubs.erase(OldHandle);
+ }
if (!CGM.getLangOpts().HIP) {
- KernelHandles[F] = F;
+ KernelHandles[F->getName()] = F;
KernelStubs[F] = F;
return F;
}
@@ -1148,7 +1228,11 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
Var->setAlignment(CGM.getPointerAlign().getAsAlign());
Var->setDSOLocal(F->isDSOLocal());
Var->setVisibility(F->getVisibility());
- KernelHandles[F] = Var;
+ auto *FD = cast<FunctionDecl>(GD.getDecl());
+ auto *FT = FD->getPrimaryTemplate();
+ if (!FT || FT->isThisDeclarationADefinition())
+ CGM.maybeSetTrivialComdat(*FD, *Var);
+ KernelHandles[F->getName()] = Var;
KernelStubs[Var] = F;
return Var;
}