1 files changed, 210 insertions, 39 deletions
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 1c578bd151bd..4d4038dae9cf 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -1,9 +1,8 @@
 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -16,9 +15,10 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/Format.h"
@@ -42,17 +42,28 @@ private:
   /// Convenience reference to the current module
   llvm::Module &TheModule;
   /// Keeps track of kernel launch stubs emitted in this module
-  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
-  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
+  struct KernelInfo {
+    llvm::Function *Kernel;
+    const Decl *D;
+  };
+  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+  struct VarInfo {
+    llvm::GlobalVariable *Var;
+    const VarDecl *D;
+    unsigned Flag;
+  };
+  llvm::SmallVector<VarInfo, 16> DeviceVars;
   /// Keeps track of variable containing handle of GPU binary. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
   /// Whether we generate relocatable device code.
   bool RelocatableDeviceCode;
+  /// Mangle context for device.
+  std::unique_ptr<MangleContext> DeviceMC;
 
-  llvm::Constant *getSetupArgumentFn() const;
-  llvm::Constant *getLaunchFn() const;
+  llvm::FunctionCallee getSetupArgumentFn() const;
+  llvm::FunctionCallee getLaunchFn() const;
 
   llvm::FunctionType *getRegisterGlobalsFnTy() const;
   llvm::FunctionType *getCallbackFnTy() const;
@@ -104,20 +115,25 @@ private:
     return DummyFunc;
   }
 
-  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
+  std::string getDeviceSideName(const Decl *ND);
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
-  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
-    DeviceVars.push_back(std::make_pair(&Var, Flags));
+  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
+                         unsigned Flags) override {
+    DeviceVars.push_back({&Var, VD, Flags});
   }
 
   /// Creates module constructor function
   llvm::Function *makeModuleCtorFunction() override;
   /// Creates module destructor function
   llvm::Function *makeModuleDtorFunction() override;
+  /// Construct and return the stub name of a kernel.
+  std::string getDeviceStubName(llvm::StringRef Name) const override;
 };
 
 }
@@ -137,7 +153,9 @@ CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
       TheModule(CGM.getModule()),
-      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
+      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
+      DeviceMC(CGM.getContext().createMangleContext(
+          CGM.getContext().getAuxTargetInfo())) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -150,7 +168,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
 }
 
-llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
+llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
   return CGM.CreateRuntimeFunction(
@@ -158,7 +176,7 @@ llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
       addPrefixToName("SetupArgument"));
 }
 
-llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
+llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
   if (CGM.getLangOpts().HIP) {
     // hipError_t hipLaunchByPtr(char *);
     return CGM.CreateRuntimeFunction(
@@ -186,16 +204,143 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
   return llvm::FunctionType::get(VoidTy, Params, false);
 }
 
+std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
+  auto *ND = cast<const NamedDecl>(D);
+  std::string DeviceSideName;
+  if (DeviceMC->shouldMangleDeclName(ND)) {
+    SmallString<256> Buffer;
+    llvm::raw_svector_ostream Out(Buffer);
+    DeviceMC->mangleName(ND, Out);
+    DeviceSideName = Out.str();
+  } else
+    DeviceSideName = ND->getIdentifier()->getName();
+  return DeviceSideName;
+}
+
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
-  EmittedKernels.push_back(CGF.CurFn);
-  emitDeviceStubBody(CGF, Args);
+  // Ensure either we have different ABIs between host and device compilations,
+  // says host compilation following MSVC ABI but device compilation follows
+  // Itanium C++ ABI or, if they follow the same ABI, kernel names after
+  // mangling should be the same after name stubbing. The later checking is
+  // very important as the device kernel name being mangled in host-compilation
+  // is used to resolve the device binaries to be executed. Inconsistent naming
+  // result in undefined behavior. Even though we cannot check that naming
+  // directly between host- and device-compilations, the host- and
+  // device-mangling in host compilation could help catching certain ones.
+  assert((CGF.CGM.getContext().getAuxTargetInfo() &&
+          (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() !=
+           CGF.CGM.getContext().getTargetInfo().getCXXABI())) ||
+         getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) ==
+             CGF.CurFn->getName());
+
+  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    emitDeviceStubBodyNew(CGF, Args);
+  else
+    emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
-                                         FunctionArgList &Args) {
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+
+  // Calculate amount of space we will need for all arguments.  If we have no
+  // args, allocate a single pointer so we still have a valid pointer to the
+  // argument array that we can pass to runtime, even if it will be unused.
+  Address KernelArgs = CGF.CreateTempAlloca(
+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+  // Store pointers to the arguments in a locally allocated launch_args.
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+    CGF.Builder.CreateDefaultAlignedStore(
+        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+  }
+
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+  // Lookup cudaLaunchKernel function.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              cudaStream_t stream);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  IdentifierInfo &cudaLaunchKernelII =
+      CGM.getContext().Idents.get("cudaLaunchKernel");
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
+      cudaLaunchKernelFD = FD;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    CGM.Error(CGF.CurFuncDecl->getLocation(),
+              "Can't find declaration for cudaLaunchKernel()");
+    return;
+  }
+  // Create temporary dim3 grid_dim, block_dim.
+  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
+  QualType Dim3Ty = GridDimParam->getType();
+  Address GridDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+  Address BlockDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+  Address ShmemSize =
+      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+  Address Stream =
+      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy,
+                              {/*gridDim=*/GridDim.getType(),
+                               /*blockDim=*/BlockDim.getType(),
+                               /*ShmemSize=*/ShmemSize.getType(),
+                               /*Stream=*/Stream.getType()},
+                              /*isVarArg=*/false),
+      "__cudaPopCallConfiguration");
+
+  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+                              {GridDim.getPointer(), BlockDim.getPointer(),
+                               ShmemSize.getPointer(), Stream.getPointer()});
+
+  // Emit the call to cudaLaunch
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  CallArgList LaunchKernelArgs;
+  LaunchKernelArgs.add(RValue::get(Kernel),
+                       cudaLaunchKernelFD->getParamDecl(0)->getType());
+  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+                       cudaLaunchKernelFD->getParamDecl(3)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+                       cudaLaunchKernelFD->getParamDecl(4)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+                       cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  QualType QT = cudaLaunchKernelFD->getType();
+  QualType CQT = QT.getCanonicalType();
+  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
+  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+
+  const CGFunctionInfo &FI =
+      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  llvm::FunctionCallee cudaLaunchKernelFn =
+      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
+               LaunchKernelArgs);
+  CGF.EmitBranch(EndBlock);
+
+  CGF.EmitBlock(EndBlock);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
+                                               FunctionArgList &Args) {
   // Emit a call to cudaSetupArgument for each arg in Args.
-  llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
+  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
   CharUnits Offset = CharUnits::Zero();
   for (const VarDecl *A : Args) {
@@ -209,17 +354,17 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
         llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
     };
-    llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
+    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
-    llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
+    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
-    CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
+    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
     CGF.EmitBlock(NextBlock);
     Offset += TyWidth;
   }
 
   // Emit the call to cudaLaunch
-  llvm::Constant *cudaLaunchFn = getLaunchFn();
+  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
   llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
   CGF.EmitBranch(EndBlock);
@@ -259,7 +404,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   llvm::Type *RegisterFuncParams[] = {
       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
-  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
+  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
       addUnderscoredPrefixToName("RegisterFunction"));
 
@@ -267,13 +412,19 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
   // each emitted kernel.
   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
-  for (llvm::Function *Kernel : EmittedKernels) {
-    llvm::Constant *KernelName = makeConstantString(Kernel->getName());
+  for (auto &&I : EmittedKernels) {
+    llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
     llvm::Value *Args[] = {
-        &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
-        KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
-        NullPtr, NullPtr, NullPtr,
+        &GpuBinaryHandlePtr,
+        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+        KernelName,
+        KernelName,
+        llvm::ConstantInt::get(IntTy, -1),
+        NullPtr,
+        NullPtr,
+        NullPtr,
+        NullPtr,
         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
     Builder.CreateCall(RegisterFunc, Args);
   }
@@ -283,13 +434,13 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                      CharPtrTy,    IntTy,     IntTy,
                                      IntTy,        IntTy};
-  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
+  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(IntTy, RegisterVarParams, false),
       addUnderscoredPrefixToName("RegisterVar"));
-  for (auto &Pair : DeviceVars) {
-    llvm::GlobalVariable *Var = Pair.first;
-    unsigned Flags = Pair.second;
-    llvm::Constant *VarName = makeConstantString(Var->getName());
+  for (auto &&Info : DeviceVars) {
+    llvm::GlobalVariable *Var = Info.Var;
+    unsigned Flags = Info.Flag;
+    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
     uint64_t VarSize =
         CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
     llvm::Value *Args[] = {
@@ -329,10 +480,14 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
   bool IsHIP = CGM.getLangOpts().HIP;
+  bool IsCUDA = CGM.getLangOpts().CUDA;
   // No need to generate ctors/dtors if there is no GPU binary.
   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
   if (CudaGpuBinaryFileName.empty() && !IsHIP)
     return nullptr;
+  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
+      DeviceVars.empty())
+    return nullptr;
 
   // void __{cuda|hip}_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
@@ -342,7 +497,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
 
   // void ** __{cuda|hip}RegisterFatBinary(void *);
-  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
+  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
       addUnderscoredPrefixToName("RegisterFatBinary"));
   // struct { int magic, int version, void * gpu_binary, void * dont_care };
@@ -516,6 +671,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     // Call __cuda_register_globals(GpuBinaryHandle);
     if (RegisterGlobalsFunc)
       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
+
+    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
+    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
+      // void __cudaRegisterFatBinaryEnd(void **);
+      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
+          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+          "__cudaRegisterFatBinaryEnd");
+      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
+    }
   } else {
     // Generate a unique module ID.
     SmallString<64> ModuleID;
@@ -532,7 +697,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     // void *, void (*)(void **))
     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
     RegisterLinkedBinaryName += ModuleID;
-    llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
+    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
 
     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
@@ -550,7 +715,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     // extern "C" int atexit(void (*f)(void));
     llvm::FunctionType *AtExitTy =
         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
-    llvm::Constant *AtExitFunc =
+    llvm::FunctionCallee AtExitFunc =
         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                   /*Local=*/true);
     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
@@ -585,7 +750,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
     return nullptr;
 
   // void __cudaUnregisterFatBinary(void ** handle);
-  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
+  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
       addUnderscoredPrefixToName("UnregisterFatBinary"));
 
@@ -627,6 +792,12 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
   return ModuleDtorFunc;
 }
 
+std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const {
+  if (!CGM.getLangOpts().HIP)
+    return Name;
+  return (Name + ".stub").str();
+}
+
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
   return new CGNVCUDARuntime(CGM);
 }