29 files changed, 256 insertions, 68 deletions
diff --git a/test/CodeGen/NVPTX/annotations.ll b/test/CodeGen/NVPTX/annotations.ll
index 39d52d382663..2341377d75a6 100644
--- a/test/CodeGen/NVPTX/annotations.ll
+++ b/test/CodeGen/NVPTX/annotations.ll
@@ -33,21 +33,14 @@ define void @kernel_func_minctasm(float* %a) {
 
 !nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8}
 
-!1 = metadata !{void (float*)* @kernel_func_maxntid, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*)* @kernel_func_maxntid,
-                metadata !"maxntidx", i32 10,
-                metadata !"maxntidy", i32 20,
-                metadata !"maxntidz", i32 30}
-
-!3 = metadata !{void (float*)* @kernel_func_reqntid, metadata !"kernel", i32 1}
-!4 = metadata !{void (float*)* @kernel_func_reqntid,
-                metadata !"reqntidx", i32 11,
-                metadata !"reqntidy", i32 22,
-                metadata !"reqntidz", i32 33}
-
-!5 = metadata !{void (float*)* @kernel_func_minctasm, metadata !"kernel", i32 1}
-!6 = metadata !{void (float*)* @kernel_func_minctasm,
-                metadata !"minctasm", i32 42}
-
-!7 = metadata !{i64 addrspace(1)* @texture, metadata !"texture", i32 1}
-!8 = metadata !{i64 addrspace(1)* @surface, metadata !"surface", i32 1}
+!1 = !{void (float*)* @kernel_func_maxntid, !"kernel", i32 1}
+!2 = !{void (float*)* @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30}
+
+!3 = !{void (float*)* @kernel_func_reqntid, !"kernel", i32 1}
+!4 = !{void (float*)* @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33}
+
+!5 = !{void (float*)* @kernel_func_minctasm, !"kernel", i32 1}
+!6 = !{void (float*)* @kernel_func_minctasm, !"minctasm", i32 42}
+
+!7 = !{i64 addrspace(1)* @texture, !"texture", i32 1}
+!8 = !{i64 addrspace(1)* @surface, !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
new file mode 100644
index 000000000000..cacffceac517
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: nounwind
+define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 {
+entry:
+; CHECK-LABEL: @_Z11TakesStruct1SPi
+; CHECK:   bitcast %struct.S* %input to i8*
+; CHECK:   call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
+  %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
+  %0 = load i32* %b, align 4
+  store i32 %0, i32* %output, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 83d491637041..8483112381f1 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -63,4 +63,4 @@ declare void @callee(float*, i8*)
 
 !nvvm.annotations = !{!0}
 
-!0 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
+!0 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/calling-conv.ll b/test/CodeGen/NVPTX/calling-conv.ll
index 190a1462adbc..3b03442ad8bd 100644
--- a/test/CodeGen/NVPTX/calling-conv.ll
+++ b/test/CodeGen/NVPTX/calling-conv.ll
@@ -27,4 +27,4 @@ define void @metadata_kernel(float* %a) {
 
 !nvvm.annotations = !{!1}
 
-!1 = metadata !{void (float*)* @metadata_kernel, metadata !"kernel", i32 1}
+!1 = !{void (float*)* @metadata_kernel, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/fma-assoc.ll b/test/CodeGen/NVPTX/fma-assoc.ll
new file mode 100644
index 000000000000..fc04c61dd691
--- /dev/null
+++ b/test/CodeGen/NVPTX/fma-assoc.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+define ptx_device float @t1_f32(float %x, float %y, float %z,
+                                float %u, float %v) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fmul float %u, %v
+  %c = fadd float %a, %b
+  %d = fadd float %c, %z
+  ret float %d
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z,
+                                 double %u, double %v) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fmul double %u, %v
+  %c = fadd double %a, %b
+  %d = fadd double %c, %z
+  ret double %d
+}
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 14b5c45b87d8..6785a01827e2 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,5 +1,8 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
+declare float @dummy_f32(float, float) #0
+declare double @dummy_f64(double, double) #0
+
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
@@ -8,6 +11,17 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
   ret float %b
 }
 
+define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  %c = fadd float %a, %w
+  %d = call float @dummy_f32(float %b, float %c)
+  ret float %d
+}
+
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
 ; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
@@ -15,3 +29,14 @@ define ptx_device double @t1_f64(double %x, double %y, double %z) {
   %b = fadd double %a, %z
   ret double %b
 }
+
+define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  %c = fadd double %a, %w
+  %d = call double @dummy_f64(double %b, double %c)
+  ret double %d
+}
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 2a527989e410..fb63d6ed575f 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -23,4 +23,4 @@ define void @foo(i32* %a, i32* %b) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i32*, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i32*, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll
index 1dd8ae40db4f..e3fe08e5f874 100644
--- a/test/CodeGen/NVPTX/i1-global.ll
+++ b/test/CodeGen/NVPTX/i1-global.ll
@@ -16,4 +16,4 @@ define void @foo(i1 %p, i32* %out) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i1, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll
index f4df87439322..aac71960551f 100644
--- a/test/CodeGen/NVPTX/i1-param.ll
+++ b/test/CodeGen/NVPTX/i1-param.ll
@@ -16,4 +16,4 @@ define void @foo(i1 %p, i32* %out) {
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (i1, i32*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll
index 9cc667557906..36c99b30425d 100644
--- a/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/test/CodeGen/NVPTX/ldu-i8.ll
@@ -2,15 +2,13 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)
 
 define i8 @foo(i8* %a) {
 ; Ensure we properly truncate off the high-order 24 bits
 ; CHECK:        ldu.global.u8
 ; CHECK:        cvt.u32.u16
 ; CHECK:        and.b32         %r{{[0-9]+}}, %r{{[0-9]+}}, 255
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)
   ret i8 %val
 }
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll
index 3b0619ff5175..4bfd68c22428 100644
--- a/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,40 +1,36 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
-declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
 
 
 ; CHECK: func0
 define i8 @func0(i8 addrspace(1)* %ptr) {
 ; ldu.global.u8
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func1
 define i32 @func1(i32 addrspace(1)* %ptr) {
 ; ldu.global.u32
-  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: func2
 define i8 @func2(i8 addrspace(1)* %ptr) {
 ; ld.global.nc.u8
-  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func3
 define i32 @func3(i32 addrspace(1)* %ptr) {
 ; ld.global.nc.u32
-  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
-
-
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 55707ea85106..fd35a7503901 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -7,15 +7,13 @@ define void @reg_plus_offset(i32* %a) {
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
   %p2 = getelementptr i32* %a, i32 8
-  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1
+  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
   %p3 = getelementptr i32* %a, i32 9
-  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1
+  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
   %t3 = mul i32 %t1, %t2
   store i32 %t3, i32* %a
   ret void
 }
 
-!1 = metadata !{ i32 4 }
-
-declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*, i32)
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
new file mode 100644
index 000000000000..3614bea16534
--- /dev/null
+++ b/test/CodeGen/NVPTX/machine-sink.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4
+@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4
+
+; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates
+; BB %entry. Over-sinking creates more register pressure in this example: the
+; backend would sink the fmuls to BB %merge but, being conservative about
+; sinking memory accesses, would leave the loads in BB %entry. As a result,
+; the loads and the two fmuls would be split across two basic blocks, causing
+; two cross-BB live ranges.
+define float @post_dominate(float %x, i1 %cond) {
+; CHECK-LABEL: post_dominate(
+entry:
+  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+; CHECK: ld.shared.f32
+; CHECK: ld.shared.f32
+  %2 = fmul float %0, %0
+  %3 = fmul float %1, %2
+; CHECK-NOT: bra
+; CHECK: mul.rn.f32
+; CHECK: mul.rn.f32
+  br i1 %cond, label %then, label %merge
+
+then:
+  %z = fadd float %x, %x
+  br label %then2
+
+then2:
+  %z2 = fadd float %z, %z
+  br label %merge
+
+merge:
+  %y = phi float [ 0.0, %entry ], [ %z2, %then2 ]
+  %w = fadd float %y, %3
+  ret float %w
+}
diff --git a/test/CodeGen/NVPTX/managed.ll b/test/CodeGen/NVPTX/managed.ll
index 4d7e7817f77b..d3f1604dbd36 100644
--- a/test/CodeGen/NVPTX/managed.ll
+++ b/test/CodeGen/NVPTX/managed.ll
@@ -8,4 +8,4 @@
 
 
 !nvvm.annotations = !{!0}
-!0 = metadata !{i32 addrspace(1)* @managed_g, metadata !"managed", i32 1}
+!0 = !{i32 addrspace(1)* @managed_g, !"managed", i32 1}
diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll
index 43bb63098f67..1ddf9739e202 100644
--- a/test/CodeGen/NVPTX/mulwide.ll
+++ b/test/CodeGen/NVPTX/mulwide.ll
@@ -23,6 +23,28 @@ define i32 @mulwideu16(i16 %a, i16 %b) {
   ret i32 %val2
 }
 
+; OPT-LABEL: @mulwide8
+; NOOPT-LABEL: @mulwide8
+define i32 @mulwide8(i8 %a, i8 %b) {
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
+  %val0 = sext i8 %a to i32
+  %val1 = sext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; OPT-LABEL: @mulwideu8
+; NOOPT-LABEL: @mulwideu8
+define i32 @mulwideu8(i8 %a, i8 %b) {
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
+  %val0 = zext i8 %a to i32
+  %val1 = zext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
 ; OPT-LABEL: @mulwide32
 ; NOOPT-LABEL: @mulwide32
 define i64 @mulwide32(i32 %a, i32 %b) {
@@ -44,3 +66,25 @@ define i64 @mulwideu32(i32 %a, i32 %b) {
   %val2 = mul i64 %val0, %val1
   ret i64 %val2
 }
+
+; OPT-LABEL: @mulwideu7
+; NOOPT-LABEL: @mulwideu7
+define i64 @mulwideu7(i7 %a, i7 %b) {
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
+  %val0 = zext i7 %a to i64
+  %val1 = zext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
+
+; OPT-LABEL: @mulwides7
+; NOOPT-LABEL: @mulwides7
+define i64 @mulwides7(i7 %a, i7 %b) {
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
+  %val0 = sext i7 %a to i64
+  %val1 = sext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 64745fcba3ba..841bbc3a517c 100644
--- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -70,5 +70,5 @@ if.end17:                                         ; preds = %if.else13, %if.then
 ; Function Attrs: noduplicate nounwind
 declare void @llvm.cuda.syncthreads() #2
 
-!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
-!1 = metadata !{null, metadata !"align", i32 8}
+!0 = !{void (float*)* @foo, !"kernel", i32 1}
+!1 = !{null, !"align", i32 8}
diff --git a/test/CodeGen/NVPTX/nvcl-param-align.ll b/test/CodeGen/NVPTX/nvcl-param-align.ll
new file mode 100644
index 000000000000..c1a489f1fc42
--- /dev/null
+++ b/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-nvcl"
+
+; CHECK-LABEL: .entry foo(
+define void @foo(i64 %img, i64 %sampler, <5 x float>* %v) {
+; The parameter alignment should be 5 * sizeof(float) = 20 bytes rounded up to
+; the next power of 2, which is 32.
+; CHECK: .param .u32 .ptr .align 32 foo_param_2
+  ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = !{void (i64, i64, <5 x float>*)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i64, <5 x float>*)* @foo, !"rdoimage", i32 0}
+!3 = !{void (i64, i64, <5 x float>*)* @foo, !"sampler", i32 1}
diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll
index 4aeff0924955..e8782ea3aa27 100644
--- a/test/CodeGen/NVPTX/refl1.ll
+++ b/test/CodeGen/NVPTX/refl1.ll
@@ -36,4 +36,4 @@ attributes #2 = { alwaysinline inlinehint nounwind readnone }
 
 !nvvm.annotations = !{!0}
 
-!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
+!0 = !{void (float*)* @foo, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
index ab6f423cd80a..1b41361cf7ed 100644
--- a/test/CodeGen/NVPTX/simple-call.ll
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -23,4 +23,4 @@ define void @kernel_func(float* %a) {
 
 !nvvm.annotations = !{!1}
 
-!1 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
+!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll
index 10a1ecc4c473..ed021346c0f9 100644
--- a/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -47,7 +47,7 @@ define void @bar(float* %red, i32 %idx) {
 
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
 
diff --git a/test/CodeGen/NVPTX/surf-read.ll b/test/CodeGen/NVPTX/surf-read.ll
index a69d03efe0d2..7383722a3596 100644
--- a/test/CodeGen/NVPTX/surf-read.ll
+++ b/test/CodeGen/NVPTX/surf-read.ll
@@ -16,5 +16,5 @@ define void @foo(i64 %img, float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, float*, i32)* @foo, metadata !"rdwrimage", i32 0}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, float*, i32)* @foo, !"rdwrimage", i32 0}
diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll
index 654c47f46957..da55a242bba6 100644
--- a/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -36,7 +36,7 @@ define void @bar(i32 %val, i32 %idx) {
 
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i32, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
 
diff --git a/test/CodeGen/NVPTX/surf-write.ll b/test/CodeGen/NVPTX/surf-write.ll
index 880231f96599..5098d2ae9e1c 100644
--- a/test/CodeGen/NVPTX/surf-write.ll
+++ b/test/CodeGen/NVPTX/surf-write.ll
@@ -12,5 +12,5 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, i32, i32)* @foo, metadata !"wroimage", i32 0}
+!1 = !{void (i64, i32, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i32, i32)* @foo, !"wroimage", i32 0}
diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll
index ee0cefa919b1..c5b5600de874 100644
--- a/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -41,6 +41,6 @@ define void @bar(float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
-!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
+!1 = !{void (i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (float*, i32)* @bar, !"kernel", i32 1}
+!3 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
index 55e4bfc9e453..6e0fda69e4f5 100644
--- a/test/CodeGen/NVPTX/tex-read.ll
+++ b/test/CodeGen/NVPTX/tex-read.ll
@@ -15,6 +15,6 @@ define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
 }
 
 !nvvm.annotations = !{!1, !2, !3}
-!1 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"kernel", i32 1}
-!2 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"rdoimage", i32 0}
-!3 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"sampler", i32 1}
+!1 = !{void (i64, i64, float*, i32)* @foo, !"kernel", i32 1}
+!2 = !{void (i64, i64, float*, i32)* @foo, !"rdoimage", i32 0}
+!3 = !{void (i64, i64, float*, i32)* @foo, !"sampler", i32 1}
diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll
index c7637ccff77a..e56eb5dea18f 100644
--- a/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -99,5 +99,5 @@ define i32 @s3() {
 
 
 !nvvm.annotations = !{!1, !2}
-!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
-!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+!1 = !{i64 addrspace(1)* @tex0, !"texture", i32 1}
+!2 = !{i64 addrspace(1)* @surf0, !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/vector-global.ll b/test/CodeGen/NVPTX/vector-global.ll
new file mode 100644
index 000000000000..a463bee3a479
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-global.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+@g1 = external global <4 x i32> ; external global variable
+; CHECK: .extern .global .align 16 .b8 g1[16];
+@g2 = global <4 x i32> zeroinitializer ; module-level global variable
+; CHECK: .visible .global .align 16 .b8 g2[16];
diff --git a/test/CodeGen/NVPTX/vector-return.ll b/test/CodeGen/NVPTX/vector-return.ll
new file mode 100644
index 000000000000..15e50f8e1443
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-return.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+declare <2 x float> @bar(<2 x float> %input)
+
+define void @foo(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @foo
+entry:
+  %call = tail call <2 x float> @bar(<2 x float> %input)
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0];
+  store <2 x float> %call, <2 x float>* %output, align 8
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]}
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/weak-linkage.ll b/test/CodeGen/NVPTX/weak-linkage.ll
index 7a1335783642..5df57b29249e 100644
--- a/test/CodeGen/NVPTX/weak-linkage.ll
+++ b/test/CodeGen/NVPTX/weak-linkage.ll
@@ -1,11 +1,17 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
-
+; CHECK: // .weak foo
 ; CHECK: .weak .func foo
 define weak void @foo() {
   ret void
 }
 
+; CHECK: // .weak baz
+; CHECK: .weak .func baz
+define weak_odr void @baz() {
+  ret void
+}
+
 ; CHECK: .visible .func bar
 define void @bar() {
   ret void