diff options
Diffstat (limited to 'test/CodeGenCUDA')
| -rw-r--r-- | test/CodeGenCUDA/Inputs/device-code-2.ll | 16 | ||||
| -rw-r--r-- | test/CodeGenCUDA/Inputs/device-code.ll | 38 | ||||
| -rw-r--r-- | test/CodeGenCUDA/address-spaces.cu | 20 | ||||
| -rw-r--r-- | test/CodeGenCUDA/device-vtable.cu | 61 | ||||
| -rw-r--r-- | test/CodeGenCUDA/filter-decl.cu | 8 | ||||
| -rw-r--r-- | test/CodeGenCUDA/function-overload.cu | 214 | ||||
| -rw-r--r-- | test/CodeGenCUDA/link-device-bitcode.cu | 70 | ||||
| -rw-r--r-- | test/CodeGenCUDA/ptx-kernels.cu | 10 |
8 files changed, 430 insertions, 7 deletions
diff --git a/test/CodeGenCUDA/Inputs/device-code-2.ll b/test/CodeGenCUDA/Inputs/device-code-2.ll new file mode 100644 index 000000000000..8fde3b13ec79 --- /dev/null +++ b/test/CodeGenCUDA/Inputs/device-code-2.ll @@ -0,0 +1,16 @@ +; Simple bit of IR to mimic CUDA's libdevice. + +target triple = "nvptx-unknown-cuda" + +define double @__nv_sin(double %a) { + ret double 1.0 +} + +define double @__nv_exp(double %a) { + ret double 3.0 +} + +define double @__unused(double %a) { + ret double 2.0 +} + diff --git a/test/CodeGenCUDA/Inputs/device-code.ll b/test/CodeGenCUDA/Inputs/device-code.ll new file mode 100644 index 000000000000..5943a000c1d8 --- /dev/null +++ b/test/CodeGenCUDA/Inputs/device-code.ll @@ -0,0 +1,38 @@ +; Simple bit of IR to mimic CUDA's libdevice. We want to be +; able to link with it and we need to make sure all __nvvm_reflect +; calls are eliminated by the time PTX has been produced. + +target triple = "nvptx-unknown-cuda" + +declare i32 @__nvvm_reflect(i8*) + +@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" + +define void @unused_subfunc(float %a) { + ret void +} + +define void @used_subfunc(float %a) { + ret void +} + +define float @_Z17device_mul_or_addff(float %a, float %b) { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) + %cmp = icmp ne i32 %reflect, 0 + br i1 %cmp, label %use_mul, label %use_add + +use_mul: + %ret1 = fmul float %a, %b + br label %exit + +use_add: + %ret2 = fadd float %a, %b + br label %exit + +exit: + %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] + + call void @used_subfunc(float %ret) + + ret float %ret +} diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu index b0ef3558e2d0..31cba958e154 100644 --- a/test/CodeGenCUDA/address-spaces.cu +++ b/test/CodeGenCUDA/address-spaces.cu @@ -5,10 +5,10 @@ #include "Inputs/cuda.h" -// CHECK: @i = addrspace(1) global +// CHECK: @i = addrspace(1) externally_initialized global __device__ int i; -// CHECK: @j = addrspace(4) global +// CHECK: @j = addrspace(4) externally_initialized global __constant__ int j; // CHECK: @k = addrspace(3) global @@ -24,7 +24,9 @@ struct MyStruct { // CHECK: @_ZZ5func2vE1a = internal addrspace(3) global [256 x float] zeroinitializer // CHECK: @_ZZ5func3vE1a = internal addrspace(3) global float 0.000000e+00 // CHECK: @_ZZ5func4vE1a = internal addrspace(3) global float 0.000000e+00 -// CHECK: @b = addrspace(3) global float 0.000000e+00 +// CHECK: @b = addrspace(3) global float undef +// CHECK: @c = addrspace(3) global %struct.c undef +// CHECK @d = addrspace(3) global %struct.d undef __device__ void foo() { // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*) @@ -117,3 +119,15 @@ __device__ int construct_shared_struct() { return t.getData(); // CHECK: call i32 @_ZN14StructWithCtor7getDataEv(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*)) } + +// Make sure we allow __shared__ structures with default or empty constructors. +struct c { + int i; +}; +__shared__ struct c c; + +struct d { + int i; + d() {} +}; +__shared__ struct d d; diff --git a/test/CodeGenCUDA/device-vtable.cu b/test/CodeGenCUDA/device-vtable.cu new file mode 100644 index 000000000000..9730e404caa4 --- /dev/null +++ b/test/CodeGenCUDA/device-vtable.cu @@ -0,0 +1,61 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +// Make sure we don't emit vtables for classes with methods that have +// inappropriate target attributes. Currently it's mostly needed in +// order to avoid emitting vtables for host-only classes on device +// side where we can't codegen them. + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=CHECK-HOST -check-prefix=CHECK-BOTH +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH + +#include "Inputs/cuda.h" + +struct H { + virtual void method(); +}; +//CHECK-HOST: @_ZTV1H = +//CHECK-HOST-SAME: @_ZN1H6methodEv +//CHECK-DEVICE-NOT: @_ZTV1H = + +struct D { + __device__ virtual void method(); +}; + +//CHECK-DEVICE: @_ZTV1D +//CHECK-DEVICE-SAME: @_ZN1D6methodEv +//CHECK-HOST-NOT: @_ZTV1D + +// This is the case with mixed host and device virtual methods. It's +// impossible to emit a valid vtable in that case because only host or +// only device methods would be available during host or device +// compilation. At the moment Clang (and NVCC) emit NULL pointers for +// unavailable methods, +struct HD { + virtual void h_method(); + __device__ virtual void d_method(); +}; +// CHECK-BOTH: @_ZTV2HD +// CHECK-DEVICE-NOT: @_ZN2HD8h_methodEv +// CHECK-DEVICE-SAME: null +// CHECK-DEVICE-SAME: @_ZN2HD8d_methodEv +// CHECK-HOST-SAME: @_ZN2HD8h_methodEv +// CHECK-HOST-NOT: @_ZN2HD8d_methodEv +// CHECK-HOST-SAME: null +// CHECK-BOTH-SAME: ] + +void H::method() {} +//CHECK-HOST: define void @_ZN1H6methodEv + +void __device__ D::method() {} +//CHECK-DEVICE: define void @_ZN1D6methodEv + +void __device__ HD::d_method() {} +// CHECK-DEVICE: define void @_ZN2HD8d_methodEv +// CHECK-HOST-NOT: define void @_ZN2HD8d_methodEv +void HD::h_method() {} +// CHECK-HOST: define void @_ZN2HD8h_methodEv +// CHECK-DEVICE-NOT: define void @_ZN2HD8h_methodEv + diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu index e69473f3e84b..023ae61f3af8 100644 --- a/test/CodeGenCUDA/filter-decl.cu +++ b/test/CodeGenCUDA/filter-decl.cu @@ -9,12 +9,12 @@ // CHECK-DEVICE-NOT: module asm "file scope asm is host only" __asm__("file scope asm is host only"); -// CHECK-HOST-NOT: constantdata = global -// CHECK-DEVICE: constantdata = global +// CHECK-HOST-NOT: constantdata = externally_initialized global +// CHECK-DEVICE: constantdata = externally_initialized global __constant__ char constantdata[256]; -// CHECK-HOST-NOT: devicedata = global -// CHECK-DEVICE: devicedata = global +// CHECK-HOST-NOT: devicedata = externally_initialized global +// CHECK-DEVICE: devicedata = externally_initialized global __device__ char devicedata[256]; // CHECK-HOST-NOT: shareddata = global diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu new file mode 100644 index 000000000000..a12ef82773a2 --- /dev/null +++ b/test/CodeGenCUDA/function-overload.cu @@ -0,0 +1,214 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +// Make sure we handle target overloads correctly. +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -fcuda-target-overloads -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \ +// RUN: -fcuda-target-overloads -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s + +// Check target overloads handling with disabled call target checks. +// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \ +// RUN: -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST \ +// RUN: -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-HOST-NC %s +// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \ +// RUN: -fcuda-disable-target-call-checks -fcuda-target-overloads \ +// RUN: -fcuda-is-device -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \ +// RUN: -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-DEVICE-NC %s + +#include "Inputs/cuda.h" + +typedef int (*fp_t)(void); +typedef void (*gp_t)(void); + +// CHECK-HOST: @hp = global i32 ()* @_Z1hv +// CHECK-HOST: @chp = global i32 ()* @ch +// CHECK-HOST: @dhp = global i32 ()* @_Z2dhv +// CHECK-HOST: @cdhp = global i32 ()* @cdh +// CHECK-HOST: @gp = global void ()* @_Z1gv + +// CHECK-BOTH-LABEL: define i32 @_Z2dhv() +__device__ int dh(void) { return 1; } +// CHECK-DEVICE: ret i32 1 +__host__ int dh(void) { return 2; } +// CHECK-HOST: ret i32 2 + +// CHECK-BOTH-LABEL: define i32 @_Z2hdv() +__host__ __device__ int hd(void) { return 3; } +// CHECK-BOTH: ret i32 3 + +// CHECK-DEVICE-LABEL: define i32 @_Z1dv() +__device__ int d(void) { return 8; } +// CHECK-DEVICE: ret i32 8 + +// CHECK-HOST-LABEL: define i32 @_Z1hv() +__host__ int h(void) { return 9; } +// CHECK-HOST: ret i32 9 + +// CHECK-BOTH-LABEL: define void @_Z1gv() +__global__ void g(void) {} +// CHECK-BOTH: ret void + +// mangled names of extern "C" __host__ __device__ functions clash +// with those of their __host__/__device__ counterparts, so +// overloading of extern "C" functions can only happen for __host__ +// and __device__ functions -- we never codegen them in the same +// compilation and therefore mangled name conflict is not a problem. + +// CHECK-BOTH-LABEL: define i32 @cdh() +extern "C" __device__ int cdh(void) {return 10;} +// CHECK-DEVICE: ret i32 10 +extern "C" __host__ int cdh(void) {return 11;} +// CHECK-HOST: ret i32 11 + +// CHECK-DEVICE-LABEL: define i32 @cd() +extern "C" __device__ int cd(void) {return 12;} +// CHECK-DEVICE: ret i32 12 + +// CHECK-HOST-LABEL: define i32 @ch() +extern "C" __host__ int ch(void) {return 13;} +// CHECK-HOST: ret i32 13 + +// CHECK-BOTH-LABEL: define i32 @chd() +extern "C" __host__ __device__ int chd(void) {return 14;} +// CHECK-BOTH: ret i32 14 + +// CHECK-HOST-LABEL: define void @_Z5hostfv() +__host__ void hostf(void) { +#if defined (NOCHECKS) + fp_t dp = d; // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp, + fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp, +#endif + fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp, + fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp, + fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp, + fp_t cdhp = cdh; // CHECK-HOST: store {{.*}} @cdh, {{.*}} %cdhp, + fp_t hdp = hd; // CHECK-HOST: store {{.*}} @_Z2hdv, {{.*}} %hdp, + fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp, + gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp, + +#if defined (NOCHECKS) + d(); // CHECK-HOST-NC: call i32 @_Z1dv() + cd(); // CHECK-HOST-NC: call i32 @cd() +#endif + h(); // CHECK-HOST: call i32 @_Z1hv() + ch(); // CHECK-HOST: call i32 @ch() + dh(); // CHECK-HOST: call i32 @_Z2dhv() + cdh(); // CHECK-HOST: call i32 @cdh() + g<<<0,0>>>(); // CHECK-HOST: call void @_Z1gv() +} + +// CHECK-DEVICE-LABEL: define void @_Z7devicefv() +__device__ void devicef(void) { + fp_t dp = d; // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp, + fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp, +#if defined (NOCHECKS) + fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp, + fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp, +#endif + fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp, + fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp, + fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp, + fp_t chdp = chd; // CHECK-DEVICE: store {{.*}} @chd, {{.*}} %chdp, + + d(); // CHECK-DEVICE: call i32 @_Z1dv() + cd(); // CHECK-DEVICE: call i32 @cd() +#if defined (NOCHECKS) + h(); // CHECK-DEVICE-NC: call i32 @_Z1hv() + ch(); // CHECK-DEVICE-NC: call i32 @ch() +#endif + dh(); // CHECK-DEVICE: call i32 @_Z2dhv() + cdh(); // CHECK-DEVICE: call i32 @cdh() +} + +// CHECK-BOTH-LABEL: define void @_Z11hostdevicefv() +__host__ __device__ void hostdevicef(void) { +#if defined (NOCHECKS) + fp_t dp = d; // CHECK-BOTH-NC: store {{.*}} @_Z1dv, {{.*}} %dp, + fp_t cdp = cd; // CHECK-BOTH-NC: store {{.*}} @cd, {{.*}} %cdp, + fp_t hp = h; // CHECK-BOTH-NC: store {{.*}} @_Z1hv, {{.*}} %hp, + fp_t chp = ch; // CHECK-BOTH-NC: store {{.*}} @ch, {{.*}} %chp, +#endif + fp_t dhp = dh; // CHECK-BOTH: store {{.*}} @_Z2dhv, {{.*}} %dhp, + fp_t cdhp = cdh; // CHECK-BOTH: store {{.*}} @cdh, {{.*}} %cdhp, + fp_t hdp = hd; // CHECK-BOTH: store {{.*}} @_Z2hdv, {{.*}} %hdp, + fp_t chdp = chd; // CHECK-BOTH: store {{.*}} @chd, {{.*}} %chdp, +#if defined (NOCHECKS) && !defined(__CUDA_ARCH__) + gp_t gp = g; // CHECK-HOST-NC: store {{.*}} @_Z1gv, {{.*}} %gp, +#endif + +#if defined (NOCHECKS) + d(); // CHECK-BOTH-NC: call i32 @_Z1dv() + cd(); // CHECK-BOTH-NC: call i32 @cd() + h(); // CHECK-BOTH-NC: call i32 @_Z1hv() + ch(); // CHECK-BOTH-NC: call i32 @ch() +#endif + dh(); // CHECK-BOTH: call i32 @_Z2dhv() + cdh(); // CHECK-BOTH: call i32 @cdh() +#if defined (NOCHECKS) && !defined(__CUDA_ARCH__) + g<<<0,0>>>(); // CHECK-HOST-NC: call void @_Z1gv() +#endif +} + +// Test for address of overloaded function resolution in the global context. +fp_t hp = h; +fp_t chp = ch; +fp_t dhp = dh; +fp_t cdhp = cdh; +gp_t gp = g; + +int x; +// Check constructors/destructors for D/H functions +struct s_cd_dh { + __host__ s_cd_dh() { x = 11; } + __device__ s_cd_dh() { x = 12; } + __host__ ~s_cd_dh() { x = 21; } + __device__ ~s_cd_dh() { x = 22; } +}; + +struct s_cd_hd { + __host__ __device__ s_cd_hd() { x = 31; } + __host__ __device__ ~s_cd_hd() { x = 32; } +}; + +// CHECK-BOTH: define void @_Z7wrapperv +#if defined(__CUDA_ARCH__) +__device__ +#else +__host__ +#endif +void wrapper() { + s_cd_dh scddh; + // CHECK-BOTH: call void @_ZN7s_cd_dhC1Ev( + s_cd_hd scdhd; + // CHECK-BOTH: call void @_ZN7s_cd_hdC1Ev + + // CHECK-BOTH: call void @_ZN7s_cd_hdD1Ev( + // CHECK-BOTH: call void @_ZN7s_cd_dhD1Ev( +} +// CHECK-BOTH: ret void + +// Now it's time to check what's been generated for the methods we used. + +// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_dhC2Ev( +// CHECK-HOST: store i32 11, +// CHECK-DEVICE: store i32 12, +// CHECK-BOTH: ret void + +// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_hdC2Ev( +// CHECK-BOTH: store i32 31, +// CHECK-BOTH: ret void + +// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_hdD2Ev( +// CHECK-BOTH: store i32 32, +// CHECK-BOTH: ret void + +// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_dhD2Ev( +// CHECK-HOST: store i32 21, +// CHECK-DEVICE: store i32 22, +// CHECK-BOTH: ret void + diff --git a/test/CodeGenCUDA/link-device-bitcode.cu b/test/CodeGenCUDA/link-device-bitcode.cu new file mode 100644 index 000000000000..de3d39c20b49 --- /dev/null +++ b/test/CodeGenCUDA/link-device-bitcode.cu @@ -0,0 +1,70 @@ +// Test for linking with CUDA's libdevice as outlined in +// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice +// +// REQUIRES: nvptx-registered-target +// +// Prepare bitcode file to link with +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \ +// RUN: %S/Inputs/device-code.ll +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t-2.bc \ +// RUN: %S/Inputs/device-code-2.ll +// +// Make sure function in device-code gets linked in and internalized. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN: -mlink-cuda-bitcode %t.bc -emit-llvm \ +// RUN: -disable-llvm-passes -o - %s \ +// RUN: | FileCheck %s -check-prefix CHECK-IR +// +// Make sure we can link two bitcode files. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN: -mlink-cuda-bitcode %t.bc -mlink-cuda-bitcode %t-2.bc \ +// RUN: -emit-llvm -disable-llvm-passes -o - %s \ +// RUN: | FileCheck %s -check-prefix CHECK-IR -check-prefix CHECK-IR-2 +// +// Make sure function in device-code gets linked but is not internalized +// without -fcuda-uses-libdevice +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN: -mlink-bitcode-file %t.bc -emit-llvm \ +// RUN: -disable-llvm-passes -o - %s \ +// RUN: | FileCheck %s -check-prefix CHECK-IR-NLD +// +// Make sure NVVMReflect pass is enabled in NVPTX back-end. +// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \ +// RUN: -mlink-cuda-bitcode %t.bc -S -o /dev/null %s \ +// RUN: -backend-option -debug-pass=Structure 2>&1 \ +// RUN: | FileCheck %s -check-prefix CHECK-REFLECT + +#include "Inputs/cuda.h" + +__device__ float device_mul_or_add(float a, float b); +extern "C" __device__ double __nv_sin(double x); +extern "C" __device__ double __nv_exp(double x); + +// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf( +// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf( +__device__ void should_not_be_internalized(float *data) {} + +// Make sure kernel call has not been internalized. +// CHECK-IR-LABEL: define void @_Z6kernelPfS_ +// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_( +__global__ __attribute__((used)) void kernel(float *out, float *in) { + *out = device_mul_or_add(in[0], in[1]); + *out += __nv_exp(__nv_sin(*out)); + should_not_be_internalized(out); +} + +// Make sure device_mul_or_add() is present in IR, is internal and +// calls __nvvm_reflect(). +// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff( +// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff( +// CHECK-IR: call i32 @__nvvm_reflect +// CHECK-IR: ret float + +// Make sure we've linked in and internalized only needed functions +// from the second bitcode file. +// CHECK-IR-2-LABEL: define internal double @__nv_sin +// CHECK-IR-2-LABEL: define internal double @__nv_exp +// CHECK-IR-2-NOT: double @__unused + +// Verify that NVVMReflect pass is among the passes run by NVPTX back-end. +// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1 diff --git a/test/CodeGenCUDA/ptx-kernels.cu b/test/CodeGenCUDA/ptx-kernels.cu index 658b3488fc18..6280e604f2ed 100644 --- a/test/CodeGenCUDA/ptx-kernels.cu +++ b/test/CodeGenCUDA/ptx-kernels.cu @@ -1,3 +1,7 @@ +// Make sure that __global__ functions are emitted along with correct +// annotations and are added to @llvm.used to prevent their elimination. +// REQUIRES: nvptx-registered-target +// // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s #include "Inputs/cuda.h" @@ -13,4 +17,10 @@ __global__ void global_function() { device_function(); } +// Make sure host-instantiated kernels are preserved on device side. +template <typename T> __global__ void templated_kernel(T param) {} +// CHECK-LABEL: define weak_odr void @_Z16templated_kernelIiEvT_ +void host_function() { templated_kernel<<<0,0>>>(0); } + // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1} +// CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1} |
