8 files changed, 430 insertions, 7 deletions
diff --git a/test/CodeGenCUDA/Inputs/device-code-2.ll b/test/CodeGenCUDA/Inputs/device-code-2.ll
new file mode 100644
index 000000000000..8fde3b13ec79
--- /dev/null
+++ b/test/CodeGenCUDA/Inputs/device-code-2.ll
@@ -0,0 +1,16 @@
+; Simple bit of IR to mimic CUDA's libdevice.
+
+target triple = "nvptx-unknown-cuda"
+
+define double @__nv_sin(double %a) {
+       ret double 1.0
+}
+
+define double @__nv_exp(double %a) {
+       ret double 3.0
+}
+
+define double @__unused(double %a) {
+       ret double 2.0
+}
+
diff --git a/test/CodeGenCUDA/Inputs/device-code.ll b/test/CodeGenCUDA/Inputs/device-code.ll
new file mode 100644
index 000000000000..5943a000c1d8
--- /dev/null
+++ b/test/CodeGenCUDA/Inputs/device-code.ll
@@ -0,0 +1,38 @@
+; Simple bit of IR to mimic CUDA's libdevice. We want to be
+; able to link with it and we need to make sure all __nvvm_reflect
+; calls are eliminated by the time PTX has been produced.
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @__nvvm_reflect(i8*)
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define void @unused_subfunc(float %a) {
+       ret void
+}
+
+define void @used_subfunc(float %a) {
+       ret void
+}
+
+define float @_Z17device_mul_or_addff(float %a, float %b) {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+
+  call void @used_subfunc(float %ret)
+
+  ret float %ret
+}
diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu
index b0ef3558e2d0..31cba958e154 100644
--- a/test/CodeGenCUDA/address-spaces.cu
+++ b/test/CodeGenCUDA/address-spaces.cu
@@ -5,10 +5,10 @@
 
 #include "Inputs/cuda.h"
 
-// CHECK: @i = addrspace(1) global
+// CHECK: @i = addrspace(1) externally_initialized global
 __device__ int i;
 
-// CHECK: @j = addrspace(4) global
+// CHECK: @j = addrspace(4) externally_initialized global
 __constant__ int j;
 
 // CHECK: @k = addrspace(3) global
@@ -24,7 +24,9 @@ struct MyStruct {
 // CHECK: @_ZZ5func2vE1a = internal addrspace(3) global [256 x float] zeroinitializer
 // CHECK: @_ZZ5func3vE1a = internal addrspace(3) global float 0.000000e+00
 // CHECK: @_ZZ5func4vE1a = internal addrspace(3) global float 0.000000e+00
-// CHECK: @b = addrspace(3) global float 0.000000e+00
+// CHECK: @b = addrspace(3) global float undef
+// CHECK: @c = addrspace(3) global %struct.c undef
+// CHECK  @d = addrspace(3) global %struct.d undef
 
 __device__ void foo() {
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
@@ -117,3 +119,15 @@ __device__ int construct_shared_struct() {
   return t.getData();
 // CHECK: call i32 @_ZN14StructWithCtor7getDataEv(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*))
 }
+
+// Make sure we allow __shared__ structures with default or empty constructors.
+struct c {
+  int i;
+};
+__shared__ struct c c;
+
+struct d {
+  int i;
+  d() {}
+};
+__shared__ struct d d;
diff --git a/test/CodeGenCUDA/device-vtable.cu b/test/CodeGenCUDA/device-vtable.cu
new file mode 100644
index 000000000000..9730e404caa4
--- /dev/null
+++ b/test/CodeGenCUDA/device-vtable.cu
@@ -0,0 +1,61 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Make sure we don't emit vtables for classes with methods that have
+// inappropriate target attributes. Currently it's mostly needed in
+// order to avoid emitting vtables for host-only classes on device
+// side where we can't codegen them.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \
+// RUN:     | FileCheck %s -check-prefix=CHECK-HOST -check-prefix=CHECK-BOTH
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH
+
+#include "Inputs/cuda.h"
+
+struct H  {
+  virtual void method();
+};
+//CHECK-HOST: @_ZTV1H =
+//CHECK-HOST-SAME: @_ZN1H6methodEv
+//CHECK-DEVICE-NOT: @_ZTV1H =
+
+struct D  {
+   __device__ virtual void method();
+};
+
+//CHECK-DEVICE: @_ZTV1D
+//CHECK-DEVICE-SAME: @_ZN1D6methodEv
+//CHECK-HOST-NOT: @_ZTV1D
+
+// This is the case with mixed host and device virtual methods.  It's
+// impossible to emit a valid vtable in that case because only host or
+// only device methods would be available during host or device
+// compilation. At the moment Clang (and NVCC) emit NULL pointers for
+// unavailable methods,
+struct HD  {
+  virtual void h_method();
+  __device__ virtual void d_method();
+};
+// CHECK-BOTH: @_ZTV2HD
+// CHECK-DEVICE-NOT: @_ZN2HD8h_methodEv
+// CHECK-DEVICE-SAME: null
+// CHECK-DEVICE-SAME: @_ZN2HD8d_methodEv
+// CHECK-HOST-SAME: @_ZN2HD8h_methodEv
+// CHECK-HOST-NOT: @_ZN2HD8d_methodEv
+// CHECK-HOST-SAME: null
+// CHECK-BOTH-SAME: ]
+
+void H::method() {}
+//CHECK-HOST: define void @_ZN1H6methodEv
+
+void __device__ D::method() {}
+//CHECK-DEVICE: define void @_ZN1D6methodEv
+
+void __device__ HD::d_method() {}
+// CHECK-DEVICE: define void @_ZN2HD8d_methodEv
+// CHECK-HOST-NOT: define void @_ZN2HD8d_methodEv
+void HD::h_method() {}
+// CHECK-HOST: define void @_ZN2HD8h_methodEv
+// CHECK-DEVICE-NOT: define void @_ZN2HD8h_methodEv
+
diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu
index e69473f3e84b..023ae61f3af8 100644
--- a/test/CodeGenCUDA/filter-decl.cu
+++ b/test/CodeGenCUDA/filter-decl.cu
@@ -9,12 +9,12 @@
 // CHECK-DEVICE-NOT: module asm "file scope asm is host only"
 __asm__("file scope asm is host only");
 
-// CHECK-HOST-NOT: constantdata = global
-// CHECK-DEVICE: constantdata = global
+// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-DEVICE: constantdata = externally_initialized global
 __constant__ char constantdata[256];
 
-// CHECK-HOST-NOT: devicedata = global
-// CHECK-DEVICE: devicedata = global
+// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-DEVICE: devicedata = externally_initialized global
 __device__ char devicedata[256];
 
 // CHECK-HOST-NOT: shareddata = global
diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu
new file mode 100644
index 000000000000..a12ef82773a2
--- /dev/null
+++ b/test/CodeGenCUDA/function-overload.cu
@@ -0,0 +1,214 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Make sure we handle target overloads correctly.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
+// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \
+// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+
+// Check target overloads handling with disabled call target checks.
+// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST \
+// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-HOST-NC %s
+// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \
+// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads \
+// RUN:    -fcuda-is-device -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \
+// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-DEVICE-NC %s
+
+#include "Inputs/cuda.h"
+
+typedef int (*fp_t)(void);
+typedef void (*gp_t)(void);
+
+// CHECK-HOST: @hp = global i32 ()* @_Z1hv
+// CHECK-HOST: @chp = global i32 ()* @ch
+// CHECK-HOST: @dhp = global i32 ()* @_Z2dhv
+// CHECK-HOST: @cdhp = global i32 ()* @cdh
+// CHECK-HOST: @gp = global void ()* @_Z1gv
+
+// CHECK-BOTH-LABEL: define i32 @_Z2dhv()
+__device__ int dh(void) { return 1; }
+// CHECK-DEVICE: ret i32 1
+__host__ int dh(void) { return 2; }
+// CHECK-HOST:   ret i32 2
+
+// CHECK-BOTH-LABEL: define i32 @_Z2hdv()
+__host__ __device__ int hd(void) { return 3; }
+// CHECK-BOTH:   ret i32 3
+
+// CHECK-DEVICE-LABEL: define i32 @_Z1dv()
+__device__ int d(void) { return 8; }
+// CHECK-DEVICE:   ret i32 8
+
+// CHECK-HOST-LABEL: define i32 @_Z1hv()
+__host__ int h(void) { return 9; }
+// CHECK-HOST:   ret i32 9
+
+// CHECK-BOTH-LABEL: define void @_Z1gv()
+__global__ void g(void) {}
+// CHECK-BOTH:   ret void
+
+// mangled names of extern "C" __host__ __device__ functions clash
+// with those of their __host__/__device__ counterparts, so
+// overloading of extern "C" functions can only happen for __host__
+// and __device__ functions -- we never codegen them in the same
+// compilation and therefore mangled name conflict is not a problem.
+
+// CHECK-BOTH-LABEL: define i32 @cdh()
+extern "C" __device__ int cdh(void) {return 10;}
+// CHECK-DEVICE:   ret i32 10
+extern "C" __host__ int cdh(void) {return 11;}
+// CHECK-HOST:     ret i32 11
+
+// CHECK-DEVICE-LABEL: define i32 @cd()
+extern "C" __device__ int cd(void) {return 12;}
+// CHECK-DEVICE:   ret i32 12
+
+// CHECK-HOST-LABEL: define i32 @ch()
+extern "C" __host__ int ch(void) {return 13;}
+// CHECK-HOST:     ret i32 13
+
+// CHECK-BOTH-LABEL: define i32 @chd()
+extern "C" __host__ __device__ int chd(void) {return 14;}
+// CHECK-BOTH:     ret i32 14
+
+// CHECK-HOST-LABEL: define void @_Z5hostfv()
+__host__ void hostf(void) {
+#if defined (NOCHECKS)
+  fp_t dp = d;   // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
+  fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp,
+#endif
+  fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp,
+  fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp,
+  fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp,
+  fp_t cdhp = cdh; // CHECK-HOST: store {{.*}} @cdh, {{.*}} %cdhp,
+  fp_t hdp = hd; // CHECK-HOST: store {{.*}} @_Z2hdv, {{.*}} %hdp,
+  fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp,
+  gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp,
+
+#if defined (NOCHECKS)
+  d();     // CHECK-HOST-NC: call i32 @_Z1dv()
+  cd();    // CHECK-HOST-NC: call i32 @cd()
+#endif
+  h();     // CHECK-HOST: call i32 @_Z1hv()
+  ch();    // CHECK-HOST: call i32 @ch()
+  dh();    // CHECK-HOST: call i32 @_Z2dhv()
+  cdh();   // CHECK-HOST: call i32 @cdh()
+  g<<<0,0>>>();  // CHECK-HOST: call void @_Z1gv()
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z7devicefv()
+__device__ void devicef(void) {
+  fp_t dp = d;   // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp,
+  fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp,
+#if defined (NOCHECKS)
+  fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
+  fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp,
+#endif
+  fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp,
+  fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp,
+  fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp,
+  fp_t chdp = chd; // CHECK-DEVICE: store {{.*}} @chd, {{.*}} %chdp,
+
+  d();     // CHECK-DEVICE: call i32 @_Z1dv()
+  cd();    // CHECK-DEVICE: call i32 @cd()
+#if defined (NOCHECKS)
+  h();     // CHECK-DEVICE-NC: call i32 @_Z1hv()
+  ch();    // CHECK-DEVICE-NC: call i32 @ch()
+#endif
+  dh();    // CHECK-DEVICE: call i32 @_Z2dhv()
+  cdh();   // CHECK-DEVICE: call i32 @cdh()
+}
+
+// CHECK-BOTH-LABEL: define void @_Z11hostdevicefv()
+__host__ __device__ void hostdevicef(void) {
+#if defined (NOCHECKS)
+  fp_t dp = d;   // CHECK-BOTH-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
+  fp_t cdp = cd; // CHECK-BOTH-NC: store {{.*}} @cd, {{.*}} %cdp,
+  fp_t hp = h; // CHECK-BOTH-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
+  fp_t chp = ch; // CHECK-BOTH-NC: store {{.*}} @ch, {{.*}} %chp,
+#endif
+  fp_t dhp = dh; // CHECK-BOTH: store {{.*}} @_Z2dhv, {{.*}} %dhp,
+  fp_t cdhp = cdh; // CHECK-BOTH: store {{.*}} @cdh, {{.*}} %cdhp,
+  fp_t hdp = hd; // CHECK-BOTH: store {{.*}} @_Z2hdv, {{.*}} %hdp,
+  fp_t chdp = chd; // CHECK-BOTH: store {{.*}} @chd, {{.*}} %chdp,
+#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
+  gp_t gp = g; // CHECK-HOST-NC: store {{.*}} @_Z1gv, {{.*}} %gp,
+#endif
+
+#if defined (NOCHECKS)
+  d();     // CHECK-BOTH-NC: call i32 @_Z1dv()
+  cd();    // CHECK-BOTH-NC: call i32 @cd()
+  h();     // CHECK-BOTH-NC: call i32 @_Z1hv()
+  ch();    // CHECK-BOTH-NC: call i32 @ch()
+#endif
+  dh();    // CHECK-BOTH: call i32 @_Z2dhv()
+  cdh();   // CHECK-BOTH: call i32 @cdh()
+#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
+  g<<<0,0>>>();  // CHECK-HOST-NC: call void @_Z1gv()
+#endif
+}
+
+// Test for address of overloaded function resolution in the global context.
+fp_t hp = h;
+fp_t chp = ch;
+fp_t dhp = dh;
+fp_t cdhp = cdh;
+gp_t gp = g;
+
+int x;
+// Check constructors/destructors for D/H functions
+struct s_cd_dh {
+  __host__ s_cd_dh() { x = 11; }
+  __device__ s_cd_dh() { x = 12; }
+  __host__ ~s_cd_dh() { x = 21; }
+  __device__ ~s_cd_dh() { x = 22; }
+};
+
+struct s_cd_hd {
+  __host__ __device__ s_cd_hd() { x = 31; }
+  __host__ __device__ ~s_cd_hd() { x = 32; }
+};
+
+// CHECK-BOTH: define void @_Z7wrapperv
+#if defined(__CUDA_ARCH__)
+__device__
+#else
+__host__
+#endif
+void wrapper() {
+  s_cd_dh scddh;
+  // CHECK-BOTH: call void @_ZN7s_cd_dhC1Ev(
+  s_cd_hd scdhd;
+  // CHECK-BOTH: call void @_ZN7s_cd_hdC1Ev
+
+  // CHECK-BOTH: call void @_ZN7s_cd_hdD1Ev(
+  // CHECK-BOTH: call void @_ZN7s_cd_dhD1Ev(
+}
+// CHECK-BOTH: ret void
+
+// Now it's time to check what's been generated for the methods we used.
+
+// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_dhC2Ev(
+// CHECK-HOST:   store i32 11,
+// CHECK-DEVICE: store i32 12,
+// CHECK-BOTH: ret void
+
+// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_hdC2Ev(
+// CHECK-BOTH:   store i32 31,
+// CHECK-BOTH: ret void
+
+// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_hdD2Ev(
+// CHECK-BOTH: store i32 32,
+// CHECK-BOTH: ret void
+
+// CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_dhD2Ev(
+// CHECK-HOST:   store i32 21,
+// CHECK-DEVICE: store i32 22,
+// CHECK-BOTH: ret void
+
diff --git a/test/CodeGenCUDA/link-device-bitcode.cu b/test/CodeGenCUDA/link-device-bitcode.cu
new file mode 100644
index 000000000000..de3d39c20b49
--- /dev/null
+++ b/test/CodeGenCUDA/link-device-bitcode.cu
@@ -0,0 +1,70 @@
+// Test for linking with CUDA's libdevice as outlined in
+// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice
+//
+// REQUIRES: nvptx-registered-target
+//
+// Prepare bitcode file to link with
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
+// RUN:    %S/Inputs/device-code.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t-2.bc \
+// RUN:    %S/Inputs/device-code-2.ll
+//
+// Make sure function in device-code gets linked in and internalized.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-cuda-bitcode %t.bc  -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR
+//
+// Make sure we can link two bitcode files.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-cuda-bitcode %t.bc -mlink-cuda-bitcode %t-2.bc \
+// RUN:    -emit-llvm -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR -check-prefix CHECK-IR-2
+//
+// Make sure function in device-code gets linked but is not internalized
+// without -fcuda-uses-libdevice
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR-NLD
+//
+// Make sure NVVMReflect pass is enabled in NVPTX back-end.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-cuda-bitcode %t.bc -S -o /dev/null %s \
+// RUN:    -backend-option -debug-pass=Structure 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-REFLECT
+
+#include "Inputs/cuda.h"
+
+__device__ float device_mul_or_add(float a, float b);
+extern "C" __device__ double __nv_sin(double x);
+extern "C" __device__ double __nv_exp(double x);
+
+// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf(
+// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf(
+__device__ void should_not_be_internalized(float *data) {}
+
+// Make sure kernel call has not been internalized.
+// CHECK-IR-LABEL: define void @_Z6kernelPfS_
+// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_(
+__global__ __attribute__((used)) void kernel(float *out, float *in) {
+  *out = device_mul_or_add(in[0], in[1]);
+  *out += __nv_exp(__nv_sin(*out));
+  should_not_be_internalized(out);
+}
+
+// Make sure device_mul_or_add() is present in IR, is internal and
+// calls __nvvm_reflect().
+// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff(
+// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff(
+// CHECK-IR: call i32 @__nvvm_reflect
+// CHECK-IR: ret float
+
+// Make sure we've linked in and internalized only needed functions
+// from the second bitcode file.
+// CHECK-IR-2-LABEL: define internal double @__nv_sin
+// CHECK-IR-2-LABEL: define internal double @__nv_exp
+// CHECK-IR-2-NOT: double @__unused
+
+// Verify that NVVMReflect pass is among the passes run by NVPTX back-end.
+// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1
diff --git a/test/CodeGenCUDA/ptx-kernels.cu b/test/CodeGenCUDA/ptx-kernels.cu
index 658b3488fc18..6280e604f2ed 100644
--- a/test/CodeGenCUDA/ptx-kernels.cu
+++ b/test/CodeGenCUDA/ptx-kernels.cu
@@ -1,3 +1,7 @@
+// Make sure that __global__ functions are emitted along with correct
+// annotations and are added to @llvm.used to prevent their elimination.
+// REQUIRES: nvptx-registered-target
+//
 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
 
 #include "Inputs/cuda.h"
@@ -13,4 +17,10 @@ __global__ void global_function() {
   device_function();
 }
 
+// Make sure host-instantiated kernels are preserved on device side.
+template <typename T> __global__ void templated_kernel(T param) {}
+// CHECK-LABEL: define weak_odr void @_Z16templated_kernelIiEvT_
+void host_function() { templated_kernel<<<0,0>>>(0); }
+
 // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
+// CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1}