diff options
Diffstat (limited to 'lib/CodeGen/CGCUDANV.cpp')
-rw-r--r-- | lib/CodeGen/CGCUDANV.cpp | 249 |
1 files changed, 210 insertions, 39 deletions
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp index 1c578bd151bd..4d4038dae9cf 100644 --- a/lib/CodeGen/CGCUDANV.cpp +++ b/lib/CodeGen/CGCUDANV.cpp @@ -1,9 +1,8 @@ //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,9 +15,10 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/Decl.h" +#include "clang/Basic/Cuda.h" +#include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/Format.h" @@ -42,17 +42,28 @@ private: /// Convenience reference to the current module llvm::Module &TheModule; /// Keeps track of kernel launch stubs emitted in this module - llvm::SmallVector<llvm::Function *, 16> EmittedKernels; - llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; + struct KernelInfo { + llvm::Function *Kernel; + const Decl *D; + }; + llvm::SmallVector<KernelInfo, 16> EmittedKernels; + struct VarInfo { + llvm::GlobalVariable *Var; + const VarDecl *D; + unsigned Flag; + }; + llvm::SmallVector<VarInfo, 16> DeviceVars; /// Keeps track of variable containing handle of GPU binary. Populated by /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; + /// Mangle context for device. + std::unique_ptr<MangleContext> DeviceMC; - llvm::Constant *getSetupArgumentFn() const; - llvm::Constant *getLaunchFn() const; + llvm::FunctionCallee getSetupArgumentFn() const; + llvm::FunctionCallee getLaunchFn() const; llvm::FunctionType *getRegisterGlobalsFnTy() const; llvm::FunctionType *getCallbackFnTy() const; @@ -104,20 +115,25 @@ private: return DummyFunc; } - void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); + std::string getDeviceSideName(const Decl *ND); public: CGNVCUDARuntime(CodeGenModule &CGM); void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; - void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { - DeviceVars.push_back(std::make_pair(&Var, Flags)); + void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, + unsigned Flags) override { + DeviceVars.push_back({&Var, VD, Flags}); } /// Creates module constructor function llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function llvm::Function *makeModuleDtorFunction() override; + /// Construct and return the stub name of a kernel. + std::string getDeviceStubName(llvm::StringRef Name) const override; }; } @@ -137,7 +153,9 @@ CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), TheModule(CGM.getModule()), - RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) { + RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), + DeviceMC(CGM.getContext().createMangleContext( + CGM.getContext().getAuxTargetInfo())) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -150,7 +168,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) VoidPtrPtrTy = VoidPtrTy->getPointerTo(); } -llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { +llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { // cudaError_t cudaSetupArgument(void *, size_t, size_t) llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; return CGM.CreateRuntimeFunction( @@ -158,7 +176,7 @@ llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { addPrefixToName("SetupArgument")); } -llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { +llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { if (CGM.getLangOpts().HIP) { // hipError_t hipLaunchByPtr(char *); return CGM.CreateRuntimeFunction( @@ -186,16 +204,143 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { return llvm::FunctionType::get(VoidTy, Params, false); } +std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { + auto *ND = cast<const NamedDecl>(D); + std::string DeviceSideName; + if (DeviceMC->shouldMangleDeclName(ND)) { + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + DeviceMC->mangleName(ND, Out); + DeviceSideName = Out.str(); + } else + DeviceSideName = ND->getIdentifier()->getName(); + return DeviceSideName; +} + void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { - EmittedKernels.push_back(CGF.CurFn); - emitDeviceStubBody(CGF, Args); + // Ensure either we have different ABIs between host and device compilations, + // says host compilation following MSVC ABI but device compilation follows + // Itanium C++ ABI or, if they follow the same ABI, kernel names after + // mangling should be the same after name stubbing. The later checking is + // very important as the device kernel name being mangled in host-compilation + // is used to resolve the device binaries to be executed. Inconsistent naming + // result in undefined behavior. Even though we cannot check that naming + // directly between host- and device-compilations, the host- and + // device-mangling in host compilation could help catching certain ones. + assert((CGF.CGM.getContext().getAuxTargetInfo() && + (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() != + CGF.CGM.getContext().getTargetInfo().getCXXABI())) || + getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) == + CGF.CurFn->getName()); + + EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); + if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), + CudaFeature::CUDA_USES_NEW_LAUNCH)) + emitDeviceStubBodyNew(CGF, Args); + else + emitDeviceStubBodyLegacy(CGF, Args); } -void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, - FunctionArgList &Args) { +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + + // Calculate amount of space we will need for all arguments. If we have no + // args, allocate a single pointer so we still have a valid pointer to the + // argument array that we can pass to runtime, even if it will be unused. + Address KernelArgs = CGF.CreateTempAlloca( + VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", + llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); + // Store pointers to the arguments in a locally allocated launch_args. + for (unsigned i = 0; i < Args.size(); ++i) { + llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); + llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); + CGF.Builder.CreateDefaultAlignedStore( + VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); + } + + llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); + + // Lookup cudaLaunchKernel function. + // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + // void **args, size_t sharedMem, + // cudaStream_t stream); + TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); + DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); + IdentifierInfo &cudaLaunchKernelII = + CGM.getContext().Idents.get("cudaLaunchKernel"); + FunctionDecl *cudaLaunchKernelFD = nullptr; + for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { + if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) + cudaLaunchKernelFD = FD; + } + + if (cudaLaunchKernelFD == nullptr) { + CGM.Error(CGF.CurFuncDecl->getLocation(), + "Can't find declaration for cudaLaunchKernel()"); + return; + } + // Create temporary dim3 grid_dim, block_dim. + ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); + QualType Dim3Ty = GridDimParam->getType(); + Address GridDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); + Address BlockDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); + Address ShmemSize = + CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); + Address Stream = + CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); + llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, + {/*gridDim=*/GridDim.getType(), + /*blockDim=*/BlockDim.getType(), + /*ShmemSize=*/ShmemSize.getType(), + /*Stream=*/Stream.getType()}, + /*isVarArg=*/false), + "__cudaPopCallConfiguration"); + + CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, + {GridDim.getPointer(), BlockDim.getPointer(), + ShmemSize.getPointer(), Stream.getPointer()}); + + // Emit the call to cudaLaunch + llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); + CallArgList LaunchKernelArgs; + LaunchKernelArgs.add(RValue::get(Kernel), + cudaLaunchKernelFD->getParamDecl(0)->getType()); + LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); + LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); + LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), + cudaLaunchKernelFD->getParamDecl(3)->getType()); + LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), + cudaLaunchKernelFD->getParamDecl(4)->getType()); + LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), + cudaLaunchKernelFD->getParamDecl(5)->getType()); + + QualType QT = cudaLaunchKernelFD->getType(); + QualType CQT = QT.getCanonicalType(); + llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); + llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); + + const CGFunctionInfo &FI = + CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); + llvm::FunctionCallee cudaLaunchKernelFn = + CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); + CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), + LaunchKernelArgs); + CGF.EmitBranch(EndBlock); + + CGF.EmitBlock(EndBlock); +} + +void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, + FunctionArgList &Args) { // Emit a call to cudaSetupArgument for each arg in Args. - llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); + llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); CharUnits Offset = CharUnits::Zero(); for (const VarDecl *A : Args) { @@ -209,17 +354,17 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), }; - llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); + llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); - llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); + llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); - CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); + CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); CGF.EmitBlock(NextBlock); Offset += TyWidth; } // Emit the call to cudaLaunch - llvm::Constant *cudaLaunchFn = getLaunchFn(); + llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); @@ -259,7 +404,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { llvm::Type *RegisterFuncParams[] = { VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; - llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( + llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, RegisterFuncParams, false), addUnderscoredPrefixToName("RegisterFunction")); @@ -267,13 +412,19 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { // __cuda_register_globals() and generate __cudaRegisterFunction() call for // each emitted kernel. llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); - for (llvm::Function *Kernel : EmittedKernels) { - llvm::Constant *KernelName = makeConstantString(Kernel->getName()); + for (auto &&I : EmittedKernels) { + llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); llvm::Value *Args[] = { - &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), - KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, - NullPtr, NullPtr, NullPtr, + &GpuBinaryHandlePtr, + Builder.CreateBitCast(I.Kernel, VoidPtrTy), + KernelName, + KernelName, + llvm::ConstantInt::get(IntTy, -1), + NullPtr, + NullPtr, + NullPtr, + NullPtr, llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; Builder.CreateCall(RegisterFunc, Args); } @@ -283,13 +434,13 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy, IntTy}; - llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( + llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, RegisterVarParams, false), addUnderscoredPrefixToName("RegisterVar")); - for (auto &Pair : DeviceVars) { - llvm::GlobalVariable *Var = Pair.first; - unsigned Flags = Pair.second; - llvm::Constant *VarName = makeConstantString(Var->getName()); + for (auto &&Info : DeviceVars) { + llvm::GlobalVariable *Var = Info.Var; + unsigned Flags = Info.Flag; + llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); uint64_t VarSize = CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); llvm::Value *Args[] = { @@ -329,10 +480,14 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { /// \endcode llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { bool IsHIP = CGM.getLangOpts().HIP; + bool IsCUDA = CGM.getLangOpts().CUDA; // No need to generate ctors/dtors if there is no GPU binary. StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; if (CudaGpuBinaryFileName.empty() && !IsHIP) return nullptr; + if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && + DeviceVars.empty()) + return nullptr; // void __{cuda|hip}_register_globals(void* handle); llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); @@ -342,7 +497,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); // void ** __{cuda|hip}RegisterFatBinary(void *); - llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( + llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), addUnderscoredPrefixToName("RegisterFatBinary")); // struct { int magic, int version, void * gpu_binary, void * dont_care }; @@ -516,6 +671,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // Call __cuda_register_globals(GpuBinaryHandle); if (RegisterGlobalsFunc) CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); + + // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. + if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), + CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { + // void __cudaRegisterFatBinaryEnd(void **); + llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), + "__cudaRegisterFatBinaryEnd"); + CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); + } } else { // Generate a unique module ID. SmallString<64> ModuleID; @@ -532,7 +697,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // void *, void (*)(void **)) SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); RegisterLinkedBinaryName += ModuleID; - llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( + llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); @@ -550,7 +715,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // extern "C" int atexit(void (*f)(void)); llvm::FunctionType *AtExitTy = llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); - llvm::Constant *AtExitFunc = + llvm::FunctionCallee AtExitFunc = CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), /*Local=*/true); CtorBuilder.CreateCall(AtExitFunc, CleanupFn); @@ -585,7 +750,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { return nullptr; // void __cudaUnregisterFatBinary(void ** handle); - llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( + llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), addUnderscoredPrefixToName("UnregisterFatBinary")); @@ -627,6 +792,12 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { return ModuleDtorFunc; } +std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const { + if (!CGM.getLangOpts().HIP) + return Name; + return (Name + ".stub").str(); +} + CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { return new CGNVCUDARuntime(CGM); } |