diff options
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp')
-rw-r--r-- | contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp | 328 |
1 files changed, 206 insertions, 122 deletions
diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp index 88030fee501b..5b43272bfa62 100644 --- a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp +++ b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp @@ -19,11 +19,13 @@ #include "clang/Basic/Cuda.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Format.h" +#include "llvm/Support/VirtualFileSystem.h" using namespace clang; using namespace CodeGen; @@ -37,7 +39,7 @@ class CGNVCUDARuntime : public CGCUDARuntime { private: llvm::IntegerType *IntTy, *SizeTy; llvm::Type *VoidTy; - llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; + llvm::PointerType *PtrTy; /// Convenience reference to LLVM Context llvm::LLVMContext &Context; @@ -49,10 +51,10 @@ private: const Decl *D; }; llvm::SmallVector<KernelInfo, 16> EmittedKernels; - // Map a device stub function to a symbol for identifying kernel in host code. + // Map a kernel mangled name to a symbol for identifying kernel in host code // For CUDA, the symbol for identifying the kernel is the same as the device // stub function. For HIP, they are different. - llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles; + llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles; // Map a kernel handle to the kernel stub. llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; struct VarInfo { @@ -69,6 +71,8 @@ private: bool RelocatableDeviceCode; /// Mangle context for device. std::unique_ptr<MangleContext> DeviceMC; + /// Some zeros used for GEPs. + llvm::Constant *Zeros[2]; llvm::FunctionCallee getSetupArgumentFn() const; llvm::FunctionCallee getLaunchFn() const; @@ -86,14 +90,25 @@ private: /// the start of the string. The result of this function can be used anywhere /// where the C code specifies const char*. llvm::Constant *makeConstantString(const std::string &Str, - const std::string &Name = "", - const std::string &SectionName = "", - unsigned Alignment = 0) { - llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), - llvm::ConstantInt::get(SizeTy, 0)}; + const std::string &Name = "") { auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); - llvm::GlobalVariable *GV = - cast<llvm::GlobalVariable>(ConstStr.getPointer()); + return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), + ConstStr.getPointer(), Zeros); + } + + /// Helper function which generates an initialized constant array from Str, + /// and optionally sets section name and alignment. AddNull specifies whether + /// the array should nave NUL termination. + llvm::Constant *makeConstantArray(StringRef Str, + StringRef Name = "", + StringRef SectionName = "", + unsigned Alignment = 0, + bool AddNull = false) { + llvm::Constant *Value = + llvm::ConstantDataArray::getString(Context, Str, AddNull); + auto *GV = new llvm::GlobalVariable( + TheModule, Value->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, Value, Name); if (!SectionName.empty()) { GV->setSection(SectionName); // Mark the address as used which make sure that this section isn't @@ -102,9 +117,7 @@ private: } if (Alignment) GV->setAlignment(llvm::Align(Alignment)); - - return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), - ConstStr.getPointer(), Zeros); + return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros); } /// Helper function that generates an empty dummy function returning void. @@ -157,6 +170,8 @@ private: llvm::Function *makeModuleDtorFunction(); /// Transform managed variables for device compilation. void transformManagedVars(); + /// Create offloading entries to register globals in RDC mode. + void createOffloadingEntries(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -177,7 +192,7 @@ public: llvm::Function *finalizeModule() override; }; -} +} // end anonymous namespace std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { if (CGM.getLangOpts().HIP) @@ -212,21 +227,17 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) TheModule(CGM.getModule()), RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), DeviceMC(InitDeviceMC(CGM)) { - CodeGen::CodeGenTypes &Types = CGM.getTypes(); - ASTContext &Ctx = CGM.getContext(); - IntTy = CGM.IntTy; SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; - - CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); - VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); - VoidPtrPtrTy = VoidPtrTy->getPointerTo(); + Zeros[0] = llvm::ConstantInt::get(SizeTy, 0); + Zeros[1] = Zeros[0]; + PtrTy = CGM.UnqualPtrTy; } llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { // cudaError_t cudaSetupArgument(void *, size_t, size_t) - llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; + llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy}; return CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, Params, false), addPrefixToName("SetupArgument")); @@ -236,27 +247,24 @@ llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { if (CGM.getLangOpts().HIP) { // hipError_t hipLaunchByPtr(char *); return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); - } else { - // cudaError_t cudaLaunch(char *); - return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); + llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr"); } + // cudaError_t cudaLaunch(char *); + return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false), + "cudaLaunch"); } llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { - return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); + return llvm::FunctionType::get(VoidTy, PtrTy, false); } llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { - return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); + return llvm::FunctionType::get(VoidTy, PtrTy, false); } llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { - auto CallbackFnTy = getCallbackFnTy(); - auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); - llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, - VoidPtrTy, CallbackFnTy->getPointerTo()}; + llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy, + llvm::PointerType::getUnqual(Context)}; return llvm::FunctionType::get(VoidTy, Params, false); } @@ -282,13 +290,12 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { DeviceSideName = std::string(ND->getIdentifier()->getName()); // Make unique name for device side static file-scope variable for HIP. - if (CGM.getContext().shouldExternalizeStaticVar(ND) && - CGM.getLangOpts().GPURelocatableDeviceCode && - !CGM.getLangOpts().CUID.empty()) { + if (CGM.getContext().shouldExternalize(ND) && + CGM.getLangOpts().GPURelocatableDeviceCode) { SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); Out << DeviceSideName; - CGM.printPostfixForExternalizedStaticVar(Out); + CGM.printPostfixForExternalizedDecl(Out, ND); DeviceSideName = std::string(Out.str()); } return DeviceSideName; @@ -297,7 +304,8 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); - if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) { + if (auto *GV = + dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) { GV->setLinkage(CGF.CurFn->getLinkage()); GV->setInitializer(CGF.CurFn); } @@ -319,29 +327,40 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. Address KernelArgs = CGF.CreateTempAlloca( - VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", + PtrTy, CharUnits::fromQuantity(16), "kernel_args", llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); // Store pointers to the arguments in a locally allocated launch_args. for (unsigned i = 0; i < Args.size(); ++i) { llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); - llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); + llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy); CGF.Builder.CreateDefaultAlignedStore( VoidVarPtr, - CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i)); + CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i)); } llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel/hipLaunchKernel function. + // HIP kernel launching API name depends on -fgpu-default-stream option. For + // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', + // it is hipLaunchKernel_spt. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); - // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, - // void **args, size_t sharedMem, - // hipStream_t stream); + // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, + // dim3 blockDim, void **args, + // size_t sharedMem, hipStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); - auto LaunchKernelName = addPrefixToName("LaunchKernel"); + std::string KernelLaunchAPI = "LaunchKernel"; + if (CGF.getLangOpts().GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) { + if (CGF.getLangOpts().HIP) + KernelLaunchAPI = KernelLaunchAPI + "_spt"; + else if (CGF.getLangOpts().CUDA) + KernelLaunchAPI = KernelLaunchAPI + "_ptsz"; + } + auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; @@ -364,8 +383,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); Address ShmemSize = CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); - Address Stream = - CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); + Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream"); llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {/*gridDim=*/GridDim.getType(), @@ -381,7 +399,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, // Emit the call to cudaLaunch llvm::Value *Kernel = - CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); @@ -397,7 +415,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, QualType QT = cudaLaunchKernelFD->getType(); QualType CQT = QT.getCanonicalType(); llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); - llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); + llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); const CGFunctionInfo &FI = CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); @@ -421,7 +439,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, Offset = Offset.alignTo(TInfo.Align); llvm::Value *Args[] = { CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), - VoidPtrTy), + PtrTy), llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), }; @@ -437,7 +455,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, // Emit the call to cudaLaunch llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); llvm::Value *Arg = - CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); @@ -473,7 +491,7 @@ static void replaceManagedVar(llvm::GlobalVariable *Var, // variable with instructions. for (auto &&Op : WorkItem) { auto *CE = cast<llvm::ConstantExpr>(Op); - auto *NewInst = llvm::createReplacementInstr(CE, I); + auto *NewInst = CE->getAsInstruction(I); NewInst->replaceUsesOfWith(OldV, NewV); OldV = CE; NewV = NewInst; @@ -515,8 +533,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { // void __cudaRegisterFunction(void **, const char *, char *, const char *, // int, uint3*, uint3*, dim3*, dim3*, int*) llvm::Type *RegisterFuncParams[] = { - VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, - VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; + PtrTy, PtrTy, PtrTy, PtrTy, IntTy, + PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)}; llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, RegisterFuncParams, false), addUnderscoredPrefixToName("RegisterFunction")); @@ -528,10 +546,10 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { for (auto &&I : EmittedKernels) { llvm::Constant *KernelName = makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); - llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); + llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy); llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), + KernelHandles[I.Kernel->getName()], KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), @@ -539,7 +557,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { NullPtr, NullPtr, NullPtr, - llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; + llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))}; Builder.CreateCall(RegisterFunc, Args); } @@ -551,16 +569,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { // void __cudaRegisterVar(void **, char *, char *, const char *, // int, int, int, int) - llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, - CharPtrTy, IntTy, VarSizeTy, - IntTy, IntTy}; + llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy, + IntTy, VarSizeTy, IntTy, IntTy}; llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, RegisterVarParams, false), addUnderscoredPrefixToName("RegisterVar")); // void __hipRegisterManagedVar(void **, char *, char *, const char *, // size_t, unsigned) - llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, - CharPtrTy, VarSizeTy, IntTy}; + llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy, + PtrTy, VarSizeTy, IntTy}; llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), addUnderscoredPrefixToName("RegisterManagedVar")); @@ -568,16 +585,13 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { // const void **, const char *, int, int); llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( llvm::FunctionType::get( - VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, - false), + VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false), addUnderscoredPrefixToName("RegisterSurface")); // void __cudaRegisterTexture(void **, const struct textureReference *, // const void **, const char *, int, int, int) llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( llvm::FunctionType::get( - VoidTy, - {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, - false), + VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false), addUnderscoredPrefixToName("RegisterTexture")); for (auto &&Info : DeviceVars) { llvm::GlobalVariable *Var = Info.Var; @@ -590,7 +604,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { uint64_t VarSize = CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); if (Info.Flags.isManaged()) { - auto ManagedVar = new llvm::GlobalVariable( + auto *ManagedVar = new llvm::GlobalVariable( CGM.getModule(), Var->getType(), /*isConstant=*/false, Var->getLinkage(), /*Init=*/Var->isDeclaration() @@ -606,8 +620,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { replaceManagedVar(Var, ManagedVar); llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(ManagedVar, VoidPtrTy), - Builder.CreateBitCast(Var, VoidPtrTy), + ManagedVar, + Var, VarName, llvm::ConstantInt::get(VarSizeTy, VarSize), llvm::ConstantInt::get(IntTy, Var->getAlignment())}; @@ -616,7 +630,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } else { llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(Var, VoidPtrTy), + Var, VarName, VarName, llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), @@ -630,15 +644,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { case DeviceVarFlags::Surface: Builder.CreateCall( RegisterSurf, - {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, - VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), + {&GpuBinaryHandlePtr, Var, VarName, VarName, + llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); break; case DeviceVarFlags::Texture: Builder.CreateCall( RegisterTex, - {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, - VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), + {&GpuBinaryHandlePtr, Var, VarName, VarName, + llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()), llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); break; @@ -653,7 +667,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { /// /// For CUDA: /// \code -/// void __cuda_module_ctor(void*) { +/// void __cuda_module_ctor() { /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); /// __cuda_register_globals(Handle); /// } @@ -661,7 +675,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { /// /// For HIP: /// \code -/// void __hip_module_ctor(void*) { +/// void __hip_module_ctor() { /// if (__hip_gpubin_handle == 0) { /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); /// __hip_register_globals(__hip_gpubin_handle); @@ -688,11 +702,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // void ** __{cuda|hip}RegisterFatBinary(void *); llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), + llvm::FunctionType::get(PtrTy, PtrTy, false), addUnderscoredPrefixToName("RegisterFatBinary")); // struct { int magic, int version, void * gpu_binary, void * dont_care }; llvm::StructType *FatbinWrapperTy = - llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); + llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy); // Register GPU binary with the CUDA runtime, store returned handle in a // global variable and save a reference in GpuBinaryHandle to be cleaned up @@ -700,8 +714,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // handle so CUDA runtime can figure out what to call on the GPU side. std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; if (!CudaGpuBinaryFileName.empty()) { - llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = - llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); + auto VFS = CGM.getFileSystem(); + auto CudaGpuBinaryOrErr = + VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false); if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { CGM.getDiags().Report(diag::err_cannot_open_file) << CudaGpuBinaryFileName << EC.message(); @@ -711,7 +726,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { } llvm::Function *ModuleCtorFunc = llvm::Function::Create( - llvm::FunctionType::get(VoidTy, VoidPtrTy, false), + llvm::FunctionType::get(VoidTy, false), llvm::GlobalValue::InternalLinkage, addUnderscoredPrefixToName("_module_ctor"), &TheModule); llvm::BasicBlock *CtorEntryBB = @@ -737,9 +752,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // If fatbin is available from early finalization, create a string // literal containing the fat binary loaded from the given file. const unsigned HIPCodeObjectAlign = 4096; - FatBinStr = - makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", - FatbinConstantName, HIPCodeObjectAlign); + FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", + FatbinConstantName, HIPCodeObjectAlign); } else { // If fatbin is not available, create an external symbol // __hip_fatbin in section .hip_fatbin. The external symbol is supposed @@ -773,8 +787,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // For CUDA, create a string literal containing the fat binary loaded from // the given file. - FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", - FatbinConstantName, 8); + FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", + FatbinConstantName, 8); FatMagic = CudaFatMagic; } @@ -788,7 +802,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // Data. Values.add(FatBinStr); // Unused in fatbin v1. - Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); + Values.add(llvm::ConstantPointerNull::get(PtrTy)); llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), /*constant*/ true); @@ -811,19 +825,21 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // The name, size, and initialization pattern of this variable is part // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( - TheModule, VoidPtrPtrTy, /*isConstant=*/false, - Linkage, - /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), + TheModule, PtrTy, /*isConstant=*/false, Linkage, + /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), "__hip_gpubin_handle"); + if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) + GpuBinaryHandle->setComdat( + CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); Address GpuBinaryAddr( - GpuBinaryHandle, + GpuBinaryHandle, PtrTy, CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); { - auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); + auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); @@ -832,9 +848,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { { CtorBuilder.SetInsertPoint(IfBlock); // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); - llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( - RegisterFatbinFunc, - CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); + llvm::CallInst *RegisterFatbinCall = + CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper); CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); CtorBuilder.CreateBr(ExitBlock); } @@ -842,7 +857,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { CtorBuilder.SetInsertPoint(ExitBlock); // Call __hip_register_globals(GpuBinaryHandle); if (RegisterGlobalsFunc) { - auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); + auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); } } @@ -850,12 +865,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // Register binary with CUDA runtime. This is substantially different in // default mode vs. separate compilation! // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); - llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( - RegisterFatbinFunc, - CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); + llvm::CallInst *RegisterFatbinCall = + CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper); GpuBinaryHandle = new llvm::GlobalVariable( - TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); + TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle"); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, CGM.getPointerAlign()); @@ -869,7 +883,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { // void __cudaRegisterFatBinaryEnd(void **); llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), + llvm::FunctionType::get(VoidTy, PtrTy, false), "__cudaRegisterFatBinaryEnd"); CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); } @@ -878,8 +892,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { SmallString<64> ModuleID; llvm::raw_svector_ostream OS(ModuleID); OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); - llvm::Constant *ModuleIDConstant = makeConstantString( - std::string(ModuleID.str()), "", ModuleIDSectionName, 32); + llvm::Constant *ModuleIDConstant = makeConstantArray( + std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true); // Create an alias for the FatbinWrapper that nvcc will look for. llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, @@ -893,9 +907,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); - llvm::Value *Args[] = {RegisterGlobalsFunc, - CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), - ModuleIDConstant, + llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant, makeDummyFunction(getCallbackFnTy())}; CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); } @@ -922,14 +934,14 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { /// /// For CUDA: /// \code -/// void __cuda_module_dtor(void*) { +/// void __cuda_module_dtor() { /// __cudaUnregisterFatBinary(Handle); /// } /// \endcode /// /// For HIP: /// \code -/// void __hip_module_dtor(void*) { +/// void __hip_module_dtor() { /// if (__hip_gpubin_handle) { /// __hipUnregisterFatBinary(__hip_gpubin_handle); /// __hip_gpubin_handle = 0; @@ -943,11 +955,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { // void __cudaUnregisterFatBinary(void ** handle); llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), + llvm::FunctionType::get(VoidTy, PtrTy, false), addUnderscoredPrefixToName("UnregisterFatBinary")); llvm::Function *ModuleDtorFunc = llvm::Function::Create( - llvm::FunctionType::get(VoidTy, VoidPtrTy, false), + llvm::FunctionType::get(VoidTy, false), llvm::GlobalValue::InternalLinkage, addUnderscoredPrefixToName("_module_dtor"), &TheModule); @@ -956,9 +968,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { CGBuilderTy DtorBuilder(CGM, Context); DtorBuilder.SetInsertPoint(DtorEntryBB); - Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( - GpuBinaryHandle->getAlignment())); - auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); + Address GpuBinaryAddr( + GpuBinaryHandle, GpuBinaryHandle->getValueType(), + CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); + auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); // There is only one HIP fat binary per linked module, however there are // multiple destructor functions. Make sure the fat binary is unregistered // only once. @@ -1071,7 +1084,7 @@ void CGNVCUDARuntime::transformManagedVars() { llvm::GlobalVariable *Var = Info.Var; if (Info.Flags.getKind() == DeviceVarFlags::Variable && Info.Flags.isManaged()) { - auto ManagedVar = new llvm::GlobalVariable( + auto *ManagedVar = new llvm::GlobalVariable( CGM.getModule(), Var->getType(), /*isConstant=*/false, Var->getLinkage(), /*Init=*/Var->isDeclaration() @@ -1097,6 +1110,53 @@ void CGNVCUDARuntime::transformManagedVars() { } } +// Creates offloading entries for all the kernels and globals that must be +// registered. The linker will provide a pointer to this section so we can +// register the symbols with the linked device image. +void CGNVCUDARuntime::createOffloadingEntries() { + StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" + : "cuda_offloading_entries"; + llvm::Module &M = CGM.getModule(); + for (KernelInfo &I : EmittedKernels) + llvm::offloading::emitOffloadingEntry( + M, KernelHandles[I.Kernel->getName()], + getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0, + llvm::offloading::OffloadGlobalEntry, Section); + + for (VarInfo &I : DeviceVars) { + uint64_t VarSize = + CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); + int32_t Flags = + (I.Flags.isExtern() + ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern) + : 0) | + (I.Flags.isConstant() + ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant) + : 0) | + (I.Flags.isNormalized() + ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized) + : 0); + if (I.Flags.getKind() == DeviceVarFlags::Variable) { + llvm::offloading::emitOffloadingEntry( + M, I.Var, getDeviceSideName(I.D), VarSize, + (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry + : llvm::offloading::OffloadGlobalEntry) | + Flags, + /*Data=*/0, Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { + llvm::offloading::emitOffloadingEntry( + M, I.Var, getDeviceSideName(I.D), VarSize, + llvm::offloading::OffloadGlobalSurfaceEntry | Flags, + I.Flags.getSurfTexType(), Section); + } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { + llvm::offloading::emitOffloadingEntry( + M, I.Var, getDeviceSideName(I.D), VarSize, + llvm::offloading::OffloadGlobalTextureEntry | Flags, + I.Flags.getSurfTexType(), Section); + } + } +} + // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { if (CGM.getLangOpts().CUDAIsDevice) { @@ -1125,17 +1185,37 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { } return nullptr; } - return makeModuleCtorFunction(); + if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + createOffloadingEntries(); + else + return makeModuleCtorFunction(); + + return nullptr; } llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, GlobalDecl GD) { - auto Loc = KernelHandles.find(F); - if (Loc != KernelHandles.end()) - return Loc->second; + auto Loc = KernelHandles.find(F->getName()); + if (Loc != KernelHandles.end()) { + auto OldHandle = Loc->second; + if (KernelStubs[OldHandle] == F) + return OldHandle; + + // We've found the function name, but F itself has changed, so we need to + // update the references. + if (CGM.getLangOpts().HIP) { + // For HIP compilation the handle itself does not change, so we only need + // to update the Stub value. + KernelStubs[OldHandle] = F; + return OldHandle; + } + // For non-HIP compilation, erase the old Stub and fall-through to creating + // new entries. + KernelStubs.erase(OldHandle); + } if (!CGM.getLangOpts().HIP) { - KernelHandles[F] = F; + KernelHandles[F->getName()] = F; KernelStubs[F] = F; return F; } @@ -1148,7 +1228,11 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, Var->setAlignment(CGM.getPointerAlign().getAsAlign()); Var->setDSOLocal(F->isDSOLocal()); Var->setVisibility(F->getVisibility()); - KernelHandles[F] = Var; + auto *FD = cast<FunctionDecl>(GD.getDecl()); + auto *FT = FD->getPrimaryTemplate(); + if (!FT || FT->isThisDeclarationADefinition()) + CGM.maybeSetTrivialComdat(*FD, *Var); + KernelHandles[F->getName()] = Var; KernelStubs[Var] = F; return Var; } |