Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r--test/CodeGen/AMDGPU/32-bit-local-address-space.ll6
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll12
-rw-r--r--test/CodeGen/AMDGPU/add.ll27
-rw-r--r--test/CodeGen/AMDGPU/add_i64.ll8
-rw-r--r--test/CodeGen/AMDGPU/address-space.ll32
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll106
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast.ll235
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.private-memory.ll31
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll114
-rw-r--r--test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll8
-rw-r--r--test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll21
-rw-r--r--test/CodeGen/AMDGPU/amdgpu.private-memory.ll530
-rw-r--r--test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll (renamed from test/CodeGen/AMDGPU/work-item-intrinsics.ll)201
-rw-r--r--test/CodeGen/AMDGPU/and-gcn.ll27
-rw-r--r--test/CodeGen/AMDGPU/and.ll289
-rw-r--r--test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll238
-rw-r--r--test/CodeGen/AMDGPU/annotate-kernel-features.ll8
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i32.ll38
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i64.ll10
-rw-r--r--test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll14
-rw-r--r--test/CodeGen/AMDGPU/atomic_load_sub.ll4
-rw-r--r--test/CodeGen/AMDGPU/basic-branch.ll49
-rw-r--r--test/CodeGen/AMDGPU/bfm.ll24
-rw-r--r--test/CodeGen/AMDGPU/big_alu.ll2345
-rw-r--r--test/CodeGen/AMDGPU/bitcast.ll36
-rw-r--r--test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll158
-rw-r--r--test/CodeGen/AMDGPU/bitreverse.ll10
-rw-r--r--test/CodeGen/AMDGPU/branch-uniformity.ll41
-rw-r--r--test/CodeGen/AMDGPU/bug-vopc-commute.ll49
-rw-r--r--test/CodeGen/AMDGPU/call.ll18
-rw-r--r--test/CodeGen/AMDGPU/call_fs.ll4
-rw-r--r--test/CodeGen/AMDGPU/captured-frame-index.ll166
-rw-r--r--test/CodeGen/AMDGPU/cayman-loop-bug.ll14
-rw-r--r--test/CodeGen/AMDGPU/cf-loop-on-constant.ll121
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll2
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes.ll118
-rw-r--r--test/CodeGen/AMDGPU/cgp-bitfield-extract.ll301
-rw-r--r--test/CodeGen/AMDGPU/ci-use-flat-for-global.ll19
-rw-r--r--test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll56
-rw-r--r--test/CodeGen/AMDGPU/coalescer_distribute.ll53
-rw-r--r--test/CodeGen/AMDGPU/commute-compares.ll104
-rw-r--r--test/CodeGen/AMDGPU/commute-shifts.ll36
-rw-r--r--test/CodeGen/AMDGPU/commute_modifiers.ll68
-rw-r--r--test/CodeGen/AMDGPU/complex-folding.ll8
-rw-r--r--test/CodeGen/AMDGPU/convergent-inlineasm.ll45
-rw-r--r--test/CodeGen/AMDGPU/copy-illegal-type.ll104
-rw-r--r--test/CodeGen/AMDGPU/ctlz.ll10
-rw-r--r--test/CodeGen/AMDGPU/ctlz_zero_undef.ll8
-rw-r--r--test/CodeGen/AMDGPU/ctpop.ll12
-rw-r--r--test/CodeGen/AMDGPU/ctpop64.ll65
-rw-r--r--test/CodeGen/AMDGPU/cube.ll46
-rw-r--r--test/CodeGen/AMDGPU/cvt_f32_ubyte.ll130
-rw-r--r--test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll33
-rw-r--r--test/CodeGen/AMDGPU/debugger-emit-prologue.ll80
-rw-r--r--test/CodeGen/AMDGPU/debugger-insert-nops.ll71
-rw-r--r--test/CodeGen/AMDGPU/debugger-reserve-regs.ll62
-rw-r--r--test/CodeGen/AMDGPU/default-fp-mode.ll78
-rw-r--r--test/CodeGen/AMDGPU/detect-dead-lanes.mir428
-rw-r--r--test/CodeGen/AMDGPU/dot4-folding.ll27
-rw-r--r--test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll12
-rw-r--r--test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll30
-rw-r--r--test/CodeGen/AMDGPU/ds-sub-offset.ll17
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll52
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_offset_order.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_superreg.ll86
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll54
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll81
-rw-r--r--test/CodeGen/AMDGPU/ds_write2st64.ll37
-rw-r--r--test/CodeGen/AMDGPU/dynamic_stackalloc.ll2
-rw-r--r--test/CodeGen/AMDGPU/elf.ll6
-rw-r--r--test/CodeGen/AMDGPU/endcf-loop-header.ll7
-rw-r--r--test/CodeGen/AMDGPU/extload-private.ll6
-rw-r--r--test/CodeGen/AMDGPU/extload.ll54
-rw-r--r--test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll126
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-f64.ll29
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i16.ll86
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i64.ll (renamed from test/CodeGen/AMDGPU/extract-vector-elt-i64.ll)29
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i8.ll151
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt_i16.ll30
-rw-r--r--test/CodeGen/AMDGPU/extractelt-to-trunc.ll77
-rw-r--r--test/CodeGen/AMDGPU/fabs.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fadd.ll4
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize.ll351
-rw-r--r--test/CodeGen/AMDGPU/fceil64.ll15
-rw-r--r--test/CodeGen/AMDGPU/fcopysign.f64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fdiv.f64.ll8
-rw-r--r--test/CodeGen/AMDGPU/fdiv.ll207
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r600.ll70
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r700+.ll84
-rw-r--r--test/CodeGen/AMDGPU/ffloor.f64.ll33
-rw-r--r--test/CodeGen/AMDGPU/flat-address-space.ll5
-rw-r--r--test/CodeGen/AMDGPU/flat_atomics.ll968
-rw-r--r--test/CodeGen/AMDGPU/flat_atomics_i64.ll975
-rw-r--r--test/CodeGen/AMDGPU/floor.ll7
-rw-r--r--test/CodeGen/AMDGPU/fma-combine.ll144
-rw-r--r--test/CodeGen/AMDGPU/fma.ll2
-rw-r--r--test/CodeGen/AMDGPU/fmad.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmax.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmax3.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax3.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.ll24
-rw-r--r--test/CodeGen/AMDGPU/fmaxnum.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmed3.ll154
-rw-r--r--test/CodeGen/AMDGPU/fmin.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmin3.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.ll53
-rw-r--r--test/CodeGen/AMDGPU/fminnum.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll16
-rw-r--r--test/CodeGen/AMDGPU/fmul.ll51
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.ll52
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.f64.ll59
-rw-r--r--test/CodeGen/AMDGPU/fneg.f64.ll2
-rw-r--r--test/CodeGen/AMDGPU/fp-classify.ll5
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.ll4
-rw-r--r--test/CodeGen/AMDGPU/fp_to_uint.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fpext.ll12
-rw-r--r--test/CodeGen/AMDGPU/fract.f64.ll111
-rw-r--r--test/CodeGen/AMDGPU/fract.ll74
-rw-r--r--test/CodeGen/AMDGPU/frem.ll8
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.f64.ll26
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.ll154
-rw-r--r--test/CodeGen/AMDGPU/fsub.ll7
-rw-r--r--test/CodeGen/AMDGPU/fsub64.ll4
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.f64.ll10
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.ll4
-rw-r--r--test/CodeGen/AMDGPU/global-constant.ll2
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i1.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i16.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i32.ll308
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i8.ll299
-rw-r--r--test/CodeGen/AMDGPU/global-variable-relocs.ll203
-rw-r--r--test/CodeGen/AMDGPU/global-zero-initializer.ll13
-rw-r--r--test/CodeGen/AMDGPU/global_atomics.ll687
-rw-r--r--test/CodeGen/AMDGPU/global_atomics_i64.ll1037
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll57
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace.ll37
-rw-r--r--test/CodeGen/AMDGPU/gv-offset-folding.ll21
-rw-r--r--test/CodeGen/AMDGPU/half.ll105
-rw-r--r--test/CodeGen/AMDGPU/hsa-default-device.ll11
-rw-r--r--test/CodeGen/AMDGPU/hsa-fp-mode.ll68
-rw-r--r--test/CodeGen/AMDGPU/hsa-func.ll61
-rw-r--r--test/CodeGen/AMDGPU/hsa-globals.ll146
-rw-r--r--test/CodeGen/AMDGPU/hsa-note-no-func.ll2
-rw-r--r--test/CodeGen/AMDGPU/hsa.ll19
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-implicit-def.ll5
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-phi.ll13
-rw-r--r--test/CodeGen/AMDGPU/imm.ll44
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-si.ll325
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-undef.mir327
-rw-r--r--test/CodeGen/AMDGPU/indirect-private-64.ll101
-rw-r--r--test/CodeGen/AMDGPU/inline-asm.ll174
-rw-r--r--test/CodeGen/AMDGPU/input-mods.ll8
-rw-r--r--test/CodeGen/AMDGPU/insert_vector_elt.ll358
-rw-r--r--test/CodeGen/AMDGPU/invalid-addrspacecast.ll8
-rw-r--r--test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll8
-rw-r--r--test/CodeGen/AMDGPU/jump-address.ll8
-rw-r--r--test/CodeGen/AMDGPU/kcache-fold.ll24
-rw-r--r--test/CodeGen/AMDGPU/kernarg-stack-alignment.ll44
-rw-r--r--test/CodeGen/AMDGPU/kernel-args.ll54
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-compute.ll24
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-graphics.ll33
-rw-r--r--test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll117
-rw-r--r--test/CodeGen/AMDGPU/large-work-group-registers.ll41
-rw-r--r--test/CodeGen/AMDGPU/lds-alignment.ll268
-rw-r--r--test/CodeGen/AMDGPU/lds-initializer.ll2
-rw-r--r--test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll47
-rw-r--r--test/CodeGen/AMDGPU/lds-output-queue.ll6
-rw-r--r--test/CodeGen/AMDGPU/lds-size.ll14
-rw-r--r--test/CodeGen/AMDGPU/lds-zero-initializer.ll2
-rw-r--r--test/CodeGen/AMDGPU/literals.ll4
-rw-r--r--test/CodeGen/AMDGPU/liveness.mir32
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll47
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll30
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll88
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll43
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll65
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll50
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll38
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.gather4.ll328
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.getlod.ll27
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.ll21
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll (renamed from test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll)46
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.imageload.ll132
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.load.dword.ll11
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.packf16.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.resinfo.ll111
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sample.ll160
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sampled.ll143
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll21
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll10
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tid.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll387
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll383
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll126
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll133
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll119
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll95
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll95
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.class.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll)140
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll)8
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll)42
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll)144
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll24
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll34
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll64
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll64
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll56
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.image.ll110
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll9
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll34
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll14
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll66
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll59
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll19
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll128
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll)31
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll49
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll39
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll68
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll45
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll38
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll)6
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll107
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll56
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.cos.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.dbg.value.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.memcpy.ll224
-rw-r--r--test/CodeGen/AMDGPU/llvm.pow.ll12
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.dot4.ll (renamed from test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll)4
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll36
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll11
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.tex.ll65
-rw-r--r--test/CodeGen/AMDGPU/llvm.rint.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.sin.ll90
-rw-r--r--test/CodeGen/AMDGPU/llvm.sqrt.ll105
-rw-r--r--test/CodeGen/AMDGPU/load-constant-f64.ll15
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i16.ll441
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i32.ll380
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i64.ll84
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i8.ll567
-rw-r--r--test/CodeGen/AMDGPU/load-global-f32.ll93
-rw-r--r--test/CodeGen/AMDGPU/load-global-f64.ll94
-rw-r--r--test/CodeGen/AMDGPU/load-global-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-global-i16.ll476
-rw-r--r--test/CodeGen/AMDGPU/load-global-i32.ll521
-rw-r--r--test/CodeGen/AMDGPU/load-global-i64.ll122
-rw-r--r--test/CodeGen/AMDGPU/load-global-i8.ll564
-rw-r--r--test/CodeGen/AMDGPU/load-i1.ll149
-rw-r--r--test/CodeGen/AMDGPU/load-input-fold.ll13
-rw-r--r--test/CodeGen/AMDGPU/load-local-f32.ll110
-rw-r--r--test/CodeGen/AMDGPU/load-local-f64.ll154
-rw-r--r--test/CodeGen/AMDGPU/load-local-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-local-i16.ll454
-rw-r--r--test/CodeGen/AMDGPU/load-local-i32.ll182
-rw-r--r--test/CodeGen/AMDGPU/load-local-i64.ll154
-rw-r--r--test/CodeGen/AMDGPU/load-local-i8.ll556
-rw-r--r--test/CodeGen/AMDGPU/load-weird-sizes.ll31
-rw-r--r--test/CodeGen/AMDGPU/load.ll737
-rw-r--r--test/CodeGen/AMDGPU/load.vec.ll25
-rw-r--r--test/CodeGen/AMDGPU/load64.ll31
-rw-r--r--test/CodeGen/AMDGPU/local-64.ll18
-rw-r--r--test/CodeGen/AMDGPU/local-atomics.ll87
-rw-r--r--test/CodeGen/AMDGPU/local-atomics64.ll154
-rw-r--r--test/CodeGen/AMDGPU/local-memory-two-objects.ll63
-rw-r--r--test/CodeGen/AMDGPU/local-memory.amdgcn.ll92
-rw-r--r--test/CodeGen/AMDGPU/local-memory.ll69
-rw-r--r--test/CodeGen/AMDGPU/local-memory.r600.ll87
-rw-r--r--test/CodeGen/AMDGPU/local-stack-slot-bug.ll22
-rw-r--r--test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll46
-rw-r--r--test/CodeGen/AMDGPU/m0-spill.ll2
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll168
-rw-r--r--test/CodeGen/AMDGPU/mad-sub.ll72
-rw-r--r--test/CodeGen/AMDGPU/mad24-get-global-id.ll36
-rw-r--r--test/CodeGen/AMDGPU/mad_int24.ll15
-rw-r--r--test/CodeGen/AMDGPU/mad_uint24.ll4
-rw-r--r--test/CodeGen/AMDGPU/madak.ll59
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll58
-rw-r--r--test/CodeGen/AMDGPU/max-literals.ll17
-rw-r--r--test/CodeGen/AMDGPU/max.ll183
-rw-r--r--test/CodeGen/AMDGPU/max3.ll8
-rw-r--r--test/CodeGen/AMDGPU/merge-stores.ll46
-rw-r--r--test/CodeGen/AMDGPU/min.ll209
-rw-r--r--test/CodeGen/AMDGPU/min3.ll12
-rw-r--r--test/CodeGen/AMDGPU/missing-store.ll8
-rw-r--r--test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll7
-rw-r--r--test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll6
-rw-r--r--test/CodeGen/AMDGPU/mubuf.ll41
-rw-r--r--test/CodeGen/AMDGPU/mul.ll6
-rw-r--r--test/CodeGen/AMDGPU/mul_int24.ll4
-rw-r--r--test/CodeGen/AMDGPU/mul_uint24.ll4
-rw-r--r--test/CodeGen/AMDGPU/multilevel-break.ll41
-rw-r--r--test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll14
-rw-r--r--test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll15
-rw-r--r--test/CodeGen/AMDGPU/no-shrink-extloads.ll32
-rw-r--r--test/CodeGen/AMDGPU/opencl-image-metadata.ll2
-rw-r--r--test/CodeGen/AMDGPU/operand-folding.ll17
-rw-r--r--test/CodeGen/AMDGPU/or.ll2
-rw-r--r--test/CodeGen/AMDGPU/over-max-lds-size.ll14
-rw-r--r--test/CodeGen/AMDGPU/parallelandifcollapse.ll3
-rw-r--r--test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll4
-rw-r--r--test/CodeGen/AMDGPU/predicate-dp4.ll11
-rw-r--r--test/CodeGen/AMDGPU/predicates.ll20
-rw-r--r--test/CodeGen/AMDGPU/private-element-size.ll252
-rw-r--r--test/CodeGen/AMDGPU/private-memory-atomics.ll12
-rw-r--r--test/CodeGen/AMDGPU/private-memory-broken.ll2
-rw-r--r--test/CodeGen/AMDGPU/private-memory-r600.ll300
-rw-r--r--test/CodeGen/AMDGPU/private-memory.ll325
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll50
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll3
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-globals.ll35
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll25
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-lifetime.ll24
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll65
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-no-opts.ll38
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll130
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-shaders.ll29
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll35
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll64
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll204
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll133
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll24
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-volatile.ll45
-rw-r--r--test/CodeGen/AMDGPU/pv-packing.ll13
-rw-r--r--test/CodeGen/AMDGPU/pv.ll49
-rw-r--r--test/CodeGen/AMDGPU/r600-encoding.ll8
-rw-r--r--test/CodeGen/AMDGPU/r600-export-fix.ll22
-rw-r--r--test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll94
-rw-r--r--test/CodeGen/AMDGPU/r600.private-memory.ll26
-rw-r--r--test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll107
-rw-r--r--test/CodeGen/AMDGPU/r600cfg.ll18
-rw-r--r--test/CodeGen/AMDGPU/rcp-pattern.ll11
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll14
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll14
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll13
-rw-r--r--test/CodeGen/AMDGPU/read_register.ll81
-rw-r--r--test/CodeGen/AMDGPU/readcyclecounter.ll25
-rw-r--r--test/CodeGen/AMDGPU/reciprocal.ll8
-rw-r--r--test/CodeGen/AMDGPU/reduce-load-width-alignment.ll38
-rw-r--r--test/CodeGen/AMDGPU/reduce-store-width-alignment.ll53
-rw-r--r--test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll43
-rw-r--r--test/CodeGen/AMDGPU/register-count-comments.ll6
-rw-r--r--test/CodeGen/AMDGPU/rename-disconnected-bug.ll33
-rw-r--r--test/CodeGen/AMDGPU/rename-independent-subregs.mir30
-rw-r--r--test/CodeGen/AMDGPU/reorder-stores.ll6
-rw-r--r--test/CodeGen/AMDGPU/ret.ll43
-rw-r--r--test/CodeGen/AMDGPU/ret_jump.ll63
-rw-r--r--test/CodeGen/AMDGPU/rotl.ll4
-rw-r--r--test/CodeGen/AMDGPU/rsq.ll14
-rw-r--r--test/CodeGen/AMDGPU/runtime-metadata.ll848
-rw-r--r--test/CodeGen/AMDGPU/rv7x0_count3.ll79
-rw-r--r--test/CodeGen/AMDGPU/s_addk_i32.ll93
-rw-r--r--test/CodeGen/AMDGPU/s_mulk_i32.ll41
-rw-r--r--test/CodeGen/AMDGPU/salu-to-valu.ll106
-rw-r--r--test/CodeGen/AMDGPU/scalar_to_vector.ll21
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll17
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll14
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop.ll14
-rw-r--r--test/CodeGen/AMDGPU/schedule-global-loads.ll10
-rw-r--r--test/CodeGen/AMDGPU/schedule-if-2.ll4
-rw-r--r--test/CodeGen/AMDGPU/schedule-if.ll4
-rw-r--r--test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll17
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll51
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll10
-rw-r--r--test/CodeGen/AMDGPU/scratch-buffer.ll22
-rw-r--r--test/CodeGen/AMDGPU/sdiv.ll60
-rw-r--r--test/CodeGen/AMDGPU/sdivrem24.ll124
-rw-r--r--test/CodeGen/AMDGPU/sdivrem64.ll78
-rw-r--r--test/CodeGen/AMDGPU/select-i1.ll14
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll81
-rw-r--r--test/CodeGen/AMDGPU/selected-stack-object.ll15
-rw-r--r--test/CodeGen/AMDGPU/setcc-opt.ll117
-rw-r--r--test/CodeGen/AMDGPU/setcc.ll36
-rw-r--r--test/CodeGen/AMDGPU/setcc64.ll12
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll22
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg.ll39
-rw-r--r--test/CodeGen/AMDGPU/sgpr-control-flow.ll10
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy.ll454
-rw-r--r--test/CodeGen/AMDGPU/shared-op-cycle.ll13
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll118
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll386
-rw-r--r--test/CodeGen/AMDGPU/shift-i64-opts.ll245
-rw-r--r--test/CodeGen/AMDGPU/shl.ll171
-rw-r--r--test/CodeGen/AMDGPU/shl_add_constant.ll16
-rw-r--r--test/CodeGen/AMDGPU/shl_add_ptr.ll38
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll25
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf.ll95
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll24
-rw-r--r--test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll8
-rw-r--r--test/CodeGen/AMDGPU/si-literal-folding.ll4
-rw-r--r--test/CodeGen/AMDGPU/si-lod-bias.ll60
-rw-r--r--test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll56
-rw-r--r--test/CodeGen/AMDGPU/si-scheduler.ll61
-rw-r--r--test/CodeGen/AMDGPU/si-sgpr-spill.ll3040
-rw-r--r--test/CodeGen/AMDGPU/si-spill-cf.ll522
-rw-r--r--test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll60
-rw-r--r--test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll74
-rw-r--r--test/CodeGen/AMDGPU/si-vector-hang.ll2
-rw-r--r--test/CodeGen/AMDGPU/sign_extend.ll144
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.f64.ll18
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.i64.ll15
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.ll2
-rw-r--r--test/CodeGen/AMDGPU/skip-if-dead.ll390
-rw-r--r--test/CodeGen/AMDGPU/smed3.ll449
-rw-r--r--test/CodeGen/AMDGPU/sminmax.ll118
-rw-r--r--test/CodeGen/AMDGPU/smrd-vccz-bug.ll49
-rw-r--r--test/CodeGen/AMDGPU/smrd.ll15
-rw-r--r--test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll8
-rw-r--r--test/CodeGen/AMDGPU/spill-scavenge-offset.ll21
-rw-r--r--test/CodeGen/AMDGPU/split-scalar-i64-add.ll4
-rw-r--r--test/CodeGen/AMDGPU/split-smrd.ll46
-rw-r--r--test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll17
-rw-r--r--test/CodeGen/AMDGPU/sra.ll371
-rw-r--r--test/CodeGen/AMDGPU/store-barrier.ll17
-rw-r--r--test/CodeGen/AMDGPU/store-v3i64.ll121
-rw-r--r--test/CodeGen/AMDGPU/store.ll71
-rw-r--r--test/CodeGen/AMDGPU/structurize.ll2
-rw-r--r--test/CodeGen/AMDGPU/structurize1.ll2
-rw-r--r--test/CodeGen/AMDGPU/sub.ll24
-rw-r--r--test/CodeGen/AMDGPU/subreg-coalescer-crash.ll8
-rw-r--r--test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll18
-rw-r--r--test/CodeGen/AMDGPU/swizzle-export.ll21
-rw-r--r--test/CodeGen/AMDGPU/target-cpu.ll112
-rw-r--r--test/CodeGen/AMDGPU/tex-clause-antidep.ll14
-rw-r--r--test/CodeGen/AMDGPU/texture-input-merge.ll16
-rw-r--r--test/CodeGen/AMDGPU/trap.ll15
-rw-r--r--test/CodeGen/AMDGPU/trunc-bitcast-vector.ll92
-rw-r--r--test/CodeGen/AMDGPU/trunc-cmp-constant.ll17
-rw-r--r--test/CodeGen/AMDGPU/trunc-store.ll34
-rw-r--r--test/CodeGen/AMDGPU/trunc.ll2
-rw-r--r--test/CodeGen/AMDGPU/udiv.ll56
-rw-r--r--test/CodeGen/AMDGPU/udivrem.ll72
-rw-r--r--test/CodeGen/AMDGPU/udivrem24.ll116
-rw-r--r--test/CodeGen/AMDGPU/udivrem64.ll18
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.f64.ll18
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.i64.ll2
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.ll2
-rw-r--r--test/CodeGen/AMDGPU/umed3.ll484
-rw-r--r--test/CodeGen/AMDGPU/unaligned-load-store.ll549
-rw-r--r--test/CodeGen/AMDGPU/undefined-subreg-liverange.ll90
-rw-r--r--test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll16
-rw-r--r--test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll27
-rw-r--r--test/CodeGen/AMDGPU/uniform-cfg.ll439
-rw-r--r--test/CodeGen/AMDGPU/uniform-crash.ll57
-rw-r--r--test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll67
-rw-r--r--test/CodeGen/AMDGPU/unknown-processor.ll20
-rw-r--r--test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll42
-rw-r--r--test/CodeGen/AMDGPU/v_cndmask.ll6
-rw-r--r--test/CodeGen/AMDGPU/v_mac.ll99
-rw-r--r--test/CodeGen/AMDGPU/valu-i1.ll51
-rw-r--r--test/CodeGen/AMDGPU/vector-alloca.ll2
-rw-r--r--test/CodeGen/AMDGPU/vector-extract-insert.ll84
-rw-r--r--test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll48
-rw-r--r--test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll26
-rw-r--r--test/CodeGen/AMDGPU/vi-removed-intrinsics.ll24
-rw-r--r--test/CodeGen/AMDGPU/vop-shrink.ll10
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll56
-rw-r--r--test/CodeGen/AMDGPU/wait.ll11
-rw-r--r--test/CodeGen/AMDGPU/waitcnt-flat.ll16
-rw-r--r--test/CodeGen/AMDGPU/wqm.ll366
-rw-r--r--test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll22
-rw-r--r--test/CodeGen/AMDGPU/write_register.ll80
-rw-r--r--test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll2
-rw-r--r--test/CodeGen/AMDGPU/xor.ll4
-rw-r--r--test/CodeGen/AMDGPU/zero_extend.ll4
-rw-r--r--test/CodeGen/AMDGPU/zext-i64-bit-operand.ll41
522 files changed, 35775 insertions, 12142 deletions
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index c7bcfd2ddab2..ff8c90457876 100644
--- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
@@ -91,12 +91,12 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
@ptr = addrspace(3) global i32 addrspace(3)* undef
-@dst = addrspace(3) global [16384 x i32] undef
+@dst = addrspace(3) global [16383 x i32] undef
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
define void @global_ptr() nounwind {
- store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+ store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
ret void
}
diff --git a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
new file mode 100644
index 000000000000..62b09dfedf15
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
+; REQUIRES: global-isel
+; This file checks that the translation from llvm IR to generic MachineInstr
+; is correct.
+
+; Tests for add.
+; CHECK: name: addi32
+; CHECK: G_ADD i32
+define i32 @addi32(i32 %arg1, i32 %arg2) {
+ %res = add i32 %arg1, %arg2
+ ret i32 %res
+}
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 2ddfa9649ac9..f37247361ece 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
;FUNC-LABEL: {{^}}test1:
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -123,12 +123,11 @@ entry:
; SI: s_add_u32
; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
@@ -145,12 +144,11 @@ entry:
; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
; SI-NOT: v_addc_u32_e32 s
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
entry:
@@ -165,12 +163,11 @@ entry:
; SI: s_add_u32
; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 8346add7df97..3d360b7d0b7a 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,13 +1,13 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
; SI-LABEL: {{^}}test_i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
%a = load i64, i64 addrspace(1)* %a_ptr
@@ -59,7 +59,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
; SI: v_add_i32
; SI: v_addc_u32
define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
%a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll
deleted file mode 100644
index 3aa2f653bf9c..000000000000
--- a/test/CodeGen/AMDGPU/address-space.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; Test that codegenprepare understands address space sizes
-
-%struct.foo = type { [3 x float], [3 x float] }
-
-; CHECK-LABEL: {{^}}do_as_ptr_calcs:
-; CHECK: s_load_dword [[SREG1:s[0-9]+]],
-; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20
-define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
-entry:
- %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
- %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
- br label %bb32
-
-bb32:
- %a = load float, float addrspace(3)* %x, align 4
- %b = load float, float addrspace(3)* %y, align 4
- %cmp = fcmp one float %a, %b
- br i1 %cmp, label %bb34, label %bb33
-
-bb33:
- unreachable
-
-bb34:
- unreachable
-}
-
-
diff --git a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
new file mode 100644
index 000000000000..67a193999204
--- /dev/null
+++ b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -0,0 +1,106 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
+
+declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrspace(4)* nocapture, i32, i32, i1) #0
+
+@lds.i32 = unnamed_addr addrspace(3) global i32 undef, align 4
+@lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4
+
+@global.i32 = unnamed_addr addrspace(1) global i32 undef, align 4
+@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
+
+; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
+define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+ store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+ ret void
+}
+
+; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
+define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: define void @store_constant_cast_group_gv_to_flat() #2
+define void @store_constant_cast_group_gv_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
+define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ ret void
+}
+
+; HSA: @store_constant_cast_global_gv_to_flat() #1
+define void @store_constant_cast_global_gv_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
+define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ ret void
+}
+
+; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
+ ret void
+}
+
+; Can't just search the pointer value
+; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
+define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+ store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
+ ret void
+}
+
+; Can't just search pointer types
+; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
+define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+ store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
+ ret void
+}
+
+; Cast group to flat, do GEP, cast back to group
+; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
+define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+ store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
+ ret void
+}
+
+; HSA: @ret_constant_cast_group_gv_gep_to_flat_to_group() #2
+define i32 addrspace(3)* @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+ ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
+}
+
+; HSA: attributes #0 = { argmemonly nounwind }
+; HSA: attributes #1 = { nounwind }
+; HSA: attributes #2 = { nounwind "amdgpu-queue-ptr" }
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 61bcd4b3c093..5a173e954f8d 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,18 +1,208 @@
-; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
-; ERROR: unsupported addrspacecast not implemented
+; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
+
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_0_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_private_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define void @cast_0_flat_to_private_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
+ store i32 7, i32* %cast
+ ret void
+}
; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.
-; CHECK-LABEL: {{^}}branch_use_flat_i32:
-; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
+; HSA-LABEL: {{^}}branch_use_flat_i32:
+; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
+; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
@@ -34,33 +224,30 @@ end:
ret void
}
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
+; HSA-LABEL: {{^}}store_flat_scratch:
+; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
+; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; HSA: flat_store_dword
+; HSA: s_barrier
+; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
- %x = call i32 @llvm.r600.read.tidig.x() #3
+ %x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x
%fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr
; Dummy call
- call void @llvm.AMDGPU.barrier.local() #1
+ call void @llvm.amdgcn.s.barrier() #1
%reload = load i32, i32 addrspace(4)* %fptr, align 4
store i32 %reload, i32 addrspace(1)* %out, align 4
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
-declare i32 @llvm.r600.read.tidig.x() #3
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
-attributes #3 = { nounwind readnone }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
new file mode 100644
index 000000000000..ad6843770fd6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
+; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
+
+; Make sure we don't overwrite workitem information with private memory
+
+; GCN-LABEL: {{^}}work_item_info:
+; GCN-NOT: v0
+; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.amdgcn.workitem.id.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll
new file mode 100644
index 000000000000..b1b3b9930d1f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll
@@ -0,0 +1,114 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}workdim:
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+define void @workdim (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; The workgroup.id values are stored in sgprs offset by the number of user
+; sgprs.
+
+; FUNC-LABEL: {{^}}workgroup_id_x:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @workgroup_id_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_id_y:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+define void @workgroup_id_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_id_z:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @workgroup_id_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 132{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_x:
+; GCN-NOHSA: buffer_store_dword v0
+define void @workitem_id_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 2180{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_y:
+
+; GCN-NOHSA: buffer_store_dword v1
+define void @workitem_id_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 4228{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_z:
+; GCN-NOHSA: buffer_store_dword v2
+define void @workitem_id_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+declare i32 @llvm.amdgcn.read.workdim() #0
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
new file mode 100644
index 000000000000..a12132f425d9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -0,0 +1,8 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
+; RUN: opt -S -amdgpu-codegenprepare < %s
+; Make sure this doesn't crash with no triple
+
+; CHECK-LABEL: @foo(
+define void @foo() {
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
new file mode 100644
index 000000000000..dd16907b748c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
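+; For the amdgpu_cs calling convention the inreg i32 argument is passed in an
+; SGPR (s8 here) and the float argument in v0, as checked below.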
+; GCN-LABEL: {{^}}shader_cc:
+; GCN: v_add_i32_e32 v0, vcc, s8, v0
+define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ %vi = bitcast float %v to i32
+ %x = add i32 %vi, %w
+ %xf = bitcast i32 %x to float
+ ret float %xf
+}
+
+; GCN-LABEL: {{^}}kernel_cc:
+; GCN: s_endpgm
+define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ %vi = bitcast float %v to i32
+ %x = add i32 %vi, %w
+ %xf = bitcast i32 %x to float
+ ret float %xf
+}
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
new file mode 100644
index 000000000000..7b5158629091
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -0,0 +1,530 @@
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
+
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+
+
+; HSAOPT: @mova_same_clause.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; HSAOPT: @high_alignment.stack = internal unnamed_addr addrspace(3) global [256 x [8 x i32]] undef, align 16
+
+
+; FUNC-LABEL: {{^}}mova_same_clause:
+; OPT-LABEL: @mova_same_clause(
+
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
+
+; HSA-PROMOTE: .amd_kernel_code_t
+; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
+; HSA-PROMOTE: .end_amd_kernel_code_t
+
+; FIXME: These should be merged
+; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
+; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+
+; HSA-ALLOCA: .amd_kernel_code_t
+; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
+; by 4 bytes.
+; HSA-ALLOCA: workitem_private_segment_byte_size = 24
+; HSA-ALLOCA: .end_amd_kernel_code_t
+
+; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7
+; HSA-ALLOCA: s_add_u32 s6, s6, s9
+; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
+
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+
+
+; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)*
+; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1
+; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0
+; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2
+; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
+; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
+
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+
+; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
+; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
+; HSAOPT: [[Y_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[WORKITEM_ID_Y]], [[LDZU]]
+; HSAOPT: [[ADD_YZ_X_X_YZ_SIZE:%[0-9]+]] = add i32 [[YZ_X_XID]], [[Y_X_Z_SIZE]]
+; HSAOPT: [[ADD_ZID:%[0-9]+]] = add i32 [[ADD_YZ_X_X_YZ_SIZE]], [[WORKITEM_ID_Z]]
+
+; HSAOPT: [[LOCAL_GEP:%[0-9]+]] = getelementptr inbounds [256 x [5 x i32]], [256 x [5 x i32]] addrspace(3)* @mova_same_clause.stack, i32 0, i32 [[ADD_ZID]]
+; HSAOPT: %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}}
+; HSAOPT: %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}}
+; HSAOPT: %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 0
+; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 1
+
+
+; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
+; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; OPT-LABEL: @high_alignment(
+; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
+define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [8 x i32], align 16
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_replace_inbounds_gep:
+; OPT-LABEL: @no_replace_inbounds_gep(
+; OPT: alloca [5 x i32]
+
+; SI-NOT: ds_write
+define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs; we should be able to optimize
+; this.
+
+; FUNC-LABEL: {{^}}multiple_structs:
+; OPT-LABEL: @multiple_structs(
+
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+; SI-NOT: v_movrel
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32, i32* %a.indirect.ptr
+ %b.indirect = load i32, i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32, i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %b = load i32, i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32, i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32, i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}short_array:
+
+; R600: MOVA_INT
+
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16, i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}char_array:
+
+; R600: MOVA_INT
+
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8, i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: {{^}}no_overlap:
+; R600_CHECK: MOV
+; R600_CHECK: [[CHAN:[XYZW]]]+
+; R600-NOT: [[CHAN]]+
+; SI: v_mov_b32_e32 v3
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8, i8* %7
+ %10 = load i8, i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
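+; The aggregate alloca tests below have no CHECK lines; they only verify that
+; code generation succeeds for these types.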
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
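+; No CHECK lines here either: a select between two pointers into the same
+; private array only needs to compile.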
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32, i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32]
+ %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ store i32 5, i32* %tmp0
+ %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %tmp2 = add i32 %tmp1, 5
+ %tmp3 = inttoptr i32 %tmp2 to i32*
+ %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
+ %tmp5 = load i32, i32* %tmp4
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @pointer_typed_alloca(
+; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
+; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
+define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4
+ %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4
+ %ld2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %ld2, i32 2
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+; HSAOPT: !0 = !{}
+; HSAOPT: !1 = !{i32 0, i32 2048}
+
+; NOHSAOPT: !0 = !{i32 0, i32 2048}
+
+
+; FUNC-LABEL: v16i32_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [2 x <16 x i32>]
+ %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
+ store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: v16float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [2 x <16 x float>]
+ %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <16 x float>, <16 x float>* %tmp0
+ store <16 x float> %tmp5, <16 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: v2float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [16 x <2 x float>]
+ %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <2 x float>, <2 x float>* %tmp0
+ store <2 x float> %tmp5, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e7fcd1ff3650..853788b92aae 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -1,34 +1,32 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; Legacy intrinsics that just read implicit parameters
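+; On non-HSA targets these lower to s_load_dword from the implicit kernel
+; argument area addressed through s[0:1].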
-; FUNC-LABEL: {{^}}ngroups_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-
-; HSA: .amd_kernel_code_t
-
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 0
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: enable_sgpr_dispatch_id = 0
-; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: enable_sgpr_private_segment_size = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-
-; HSA: .end_amd_kernel_code_t
+; FUNC-LABEL: {{^}}workdim_legacy:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].Z
+define void @workdim_legacy (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
-; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; FUNC-LABEL: {{^}}ngroups_x:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
define void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -37,13 +35,13 @@ entry:
}
; FUNC-LABEL: {{^}}ngroups_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
define void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -52,13 +50,13 @@ entry:
}
; FUNC-LABEL: {{^}}ngroups_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
define void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -67,13 +65,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
define void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -82,13 +80,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
define void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -97,13 +95,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
define void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -111,97 +109,94 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}local_size_x:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
+define void @local_size_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
+define void @local_size_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
+define void @local_size_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Legacy use of r600 intrinsics by GCN
+
; The tgid values are stored in sgprs offset by the number of user
; sgprs.
-; FUNC-LABEL: {{^}}tgid_x:
-; HSA: .amd_kernel_code_t
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 0
-; HSA: compute_pgm_rsrc2_tgid_z_en = 0
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-; HSA: .end_amd_kernel_code_t
-
+; FUNC-LABEL: {{^}}tgid_x_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
-; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x(i32 addrspace(1)* %out) {
+define void @tgid_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}tgid_y:
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 1
-; HSA: compute_pgm_rsrc2_tgid_z_en = 0
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; FUNC-LABEL: {{^}}tgid_y_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
-; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
-; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_y(i32 addrspace(1)* %out) {
+define void @tgid_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}tgid_z:
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 0
-; HSA: compute_pgm_rsrc2_tgid_z_en = 1
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 0
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: enable_sgpr_dispatch_id = 0
-; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: enable_sgpr_private_segment_size = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-
+; FUNC-LABEL: {{^}}tgid_z_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
-; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z(i32 addrspace(1)* %out) {
+define void @tgid_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -212,11 +207,9 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 132{{$}}
-; FUNC-LABEL: {{^}}tidig_x:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; FUNC-LABEL: {{^}}tidig_x_legacy:
; GCN-NOHSA: buffer_store_dword v0
-; HSA: flat_store_dword v0
-define void @tidig_x(i32 addrspace(1)* %out) {
+define void @tidig_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -227,12 +220,10 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 2180{{$}}
-; FUNC-LABEL: {{^}}tidig_y:
+; FUNC-LABEL: {{^}}tidig_y_legacy:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
; GCN-NOHSA: buffer_store_dword v1
-; HSA: flat_store_dword v1
-define void @tidig_y(i32 addrspace(1)* %out) {
+define void @tidig_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -243,11 +234,9 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 4228{{$}}
-; FUNC-LABEL: {{^}}tidig_z:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
+; FUNC-LABEL: {{^}}tidig_z_legacy:
; GCN-NOHSA: buffer_store_dword v2
-; HSA: flat_store_dword v2
-define void @tidig_z(i32 addrspace(1)* %out) {
+define void @tidig_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -262,6 +251,10 @@ declare i32 @llvm.r600.read.global.size.x() #0
declare i32 @llvm.r600.read.global.size.y() #0
declare i32 @llvm.r600.read.global.size.z() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
declare i32 @llvm.r600.read.tgid.z() #0
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
new file mode 100644
index 000000000000..dde5f8c21769
--- /dev/null
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
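+; Split out of and.ll: this version uses the GCN-only llvm.amdgcn.mbcnt.lo
+; intrinsic to form the branch condition, so it cannot run under the r600 RUN
+; line in that file.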
+; FUNC-LABEL: {{^}}v_and_i64_br:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+entry:
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %if, label %endif
+
+if:
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %and = and i64 %a, %b
+ br label %endif
+
+endif:
+ %tmp1 = phi i64 [%and, %if], [0, %entry]
+ store i64 %tmp1, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index f83fb16101fb..0046bc93826e 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
@@ -177,50 +177,78 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
ret void
}
-; FUNC-LABEL: {{^}}s_and_constant_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}s_and_constant_i64:
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
+; SI: buffer_store_dwordx2
define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
- %and = and i64 %a, 281474976710655
+ %and = and i64 %a, 549756338176
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}v_and_i64:
-; SI: v_and_b32
-; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
- %and = and i64 %a, %b
+; FUNC-LABEL: {{^}}s_and_multi_use_constant_i64:
+; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
+; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
+define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %and0 = and i64 %a, 549756338176
+ %and1 = and i64 %b, 549756338176
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64:
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
+; SI-NOT: and
+; SI: buffer_store_dwordx2
+define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}v_and_i64_br:
+; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
+; SI: s_load_dword [[A:s[0-9]+]]
+; SI: s_load_dword [[B:s[0-9]+]]
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_lshl_b32 [[A]], [[A]], 1
+; SI: s_lshl_b32 [[B]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI-NOT: and
+; SI: buffer_store_dwordx2
+define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+ %shl.a = shl i64 %a, 1
+ %shl.b = shl i64 %b, 1
+ %and0 = and i64 %shl.a, 62
+ %and1 = and i64 %shl.b, 62
+ %add0 = add i64 %and0, %c
+ %add1 = add i64 %and1, %c
+ store volatile i64 %add0, i64 addrspace(1)* %out
+ store volatile i64 %add1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_i64:
; SI: v_and_b32
; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
-entry:
- %tmp0 = icmp eq i32 %cond, 0
- br i1 %tmp0, label %if, label %endif
-
-if:
+define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%and = and i64 %a, %b
- br label %endif
-
-endif:
- %tmp1 = phi i64 [%and, %if], [0, %entry]
- store i64 %tmp1, i64 addrspace(1)* %out, align 8
+ store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_and_constant_i64:
-; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207
-; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}}
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
@@ -229,10 +257,54 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
ret void
}
-; FIXME: Should replace and 0
+; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
+; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
+; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load volatile i64, i64 addrspace(1)* %aptr
+ %b = load volatile i64, i64 addrspace(1)* %aptr
+ %and0 = and i64 %a, 1231231234567
+ %and1 = and i64 %b, 1231231234567
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_multi_use_inline_imm_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI-NOT: and
+; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
+; SI-NOT: and
+; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
+; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
+define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load volatile i64, i64 addrspace(1)* %aptr
+ %b = load volatile i64, i64 addrspace(1)* %aptr
+ %and0 = and i64 %a, 63
+ %and1 = and i64 %b, 63
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
-; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: and
+; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1234567
@@ -240,10 +312,12 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
ret void
}
-; FIXME: Replace and 0 with mov 0
; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
+; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 64
@@ -252,15 +326,39 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
}
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
+; SI: s_load_dword
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
+; SI-NOT: and
+; SI: buffer_store_dword
define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
+; SI: s_load_dword [[A:s[0-9]+]]
+; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI-NOT: and
+; SI: s_add_u32
+; SI-NEXT: s_addc_u32
+define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+ %shl = shl i64 %a, 1
+ %and = and i64 %shl, 64
+ %add = add i64 %and, %b
+ store i64 %add, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +366,14 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4607182418800017408
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -276,7 +381,14 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
}
; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13830554455654793216
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -284,47 +396,85 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
}
; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4602678819172646912
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13826050856027422720
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4611686018427387904
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13835058055282163712
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4616189618054758400
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13839561654909534208
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -335,22 +485,26 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32-bits, which is not a valid 64-bit inline immediate.
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1082130432
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FIXME: Copy of -1 register
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
-; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, -1065353216
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -358,20 +512,25 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
}
; Shift into upper 32-bits
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4647714815446351872
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13871086852301127680
store i64 %and, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
new file mode 100644
index 000000000000..084a6933da26
--- /dev/null
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -0,0 +1,238 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+
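+; The annotator should give each function the attribute group (#1-#9 below)
+; matching the implicit-input intrinsics it calls.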
+; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.y()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.z()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ %val3 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ store volatile i32 %val3, i32 addrspace(1)* %ptr
+ store volatile i32 %val4, i32 addrspace(1)* %ptr
+ store volatile i32 %val5, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+ %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+ %val = load i32, i32 addrspace(2)* %bc
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+ %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+ %bc = bitcast i8 addrspace(2)* %queue.ptr to i32 addrspace(2)*
+ %val = load i32, i32 addrspace(2)* %bc
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; No-op addrspacecast should not use queue ptr
+; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ %ld = load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; HSA: attributes #0 = { nounwind readnone }
+; HSA: attributes #1 = { nounwind }
+; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" }
+; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" }
+; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" }
+; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" }
+; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" }
+; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" }
+; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
+; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index b116c72322bb..a4e7bb67d507 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,5 +1,4 @@
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s
-; RUN: opt -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
@@ -13,11 +12,6 @@ declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
-declare i32 @llvm.r600.read.global.size.x() #0
-declare i32 @llvm.r600.read.global.size.y() #0
-declare i32 @llvm.r600.read.global.size.z() #0
-
-
; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.x()
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index f8a74222d566..b00fff0a6f9a 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -1,8 +1,9 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-declare i32 @llvm.SI.tid() nounwind readnone
-declare void @llvm.AMDGPU.barrier.local() nounwind convergent
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+declare void @llvm.amdgcn.s.barrier() #2
; The required pointer calculations for the alloca'd actually requires
; an add and won't be folded into the addressing, which fails with a
@@ -14,31 +15,38 @@ declare void @llvm.AMDGPU.barrier.local() nounwind convergent
; FIXME: We end up with zero argument for ADD, because
; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
; with the appropriate offset. We should fold this into the store.
+
; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
+; SI-ALLOCA: s_barrier
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
;
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
; alloca to a vector. It currently fails because it does not know how
; to interpret:
-; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+; getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16
+; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %alloca = alloca [4 x i32], i32 4, align 16
- %tid = call i32 @llvm.SI.tid() readnone
- %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
- %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+ %alloca = alloca [16 x i32], align 16
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
+ %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid
%a = load i32, i32 addrspace(1)* %a_ptr
%b = load i32, i32 addrspace(1)* %b_ptr
%result = add i32 %a, %b
- %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+ %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
store i32 %result, i32* %alloca_ptr, align 4
; Dummy call
- call void @llvm.AMDGPU.barrier.local() nounwind convergent
+ call void @llvm.amdgcn.s.barrier()
%reload = load i32, i32* %alloca_ptr, align 4
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
ret void
}
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index a3ae3c3aea16..b914edf2928e 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -1,13 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.SI.tid() readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}test_array_ptr_calc:
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_i32
; SI: s_endpgm
define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.SI.tid() readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
%b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
%a = load i32, i32 addrspace(1)* %a_ptr
@@ -16,3 +18,5 @@ define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] ad
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index ef2560ef1849..6a2716cc903e 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -3,11 +3,11 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
@@ -21,12 +21,12 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
}
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
@@ -75,8 +75,8 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
index 4c6f45525b9e..184d07ffad9c 100644
--- a/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}atomic_sub_local:
; R600: LDS_SUB *
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index abdc4afef472..ff730a085255 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,9 +1,23 @@
-; XFAIL: *
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
-; CHECK-LABEL: {{^}}test_branch(
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+; GCN-LABEL: {{^}}test_branch:
+; GCNNOOPT: v_writelane_b32
+; GCNNOOPT: v_writelane_b32
+; GCNNOOPT: v_writelane_b32
+; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#1
+; GCNNOOPT: v_readlane_b32
+; GCNNOOPT: v_readlane_b32
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+; GCN: {{^}}[[END]]
+; GCN: s_endpgm
+define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
%cmp = icmp ne i32 %val, 0
br i1 %cmp, label %store, label %end
@@ -14,3 +28,28 @@ store:
end:
ret void
}
+
+; GCN-LABEL: {{^}}test_brcc_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1,
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+; GCN: {{^}}[[END]]
+; GCN: s_endpgm
+define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+ %cmp0 = icmp ne i1 %val, 0
+ br i1 %cmp0, label %store, label %end
+
+store:
+ store i32 222, i32 addrspace(1)* %out
+ ret void
+
+end:
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll
new file mode 100644
index 000000000000..73db87d7ae9e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfm.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}bfm_pattern:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ %c = shl i32 %b, %y
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_pattern_simple:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
+define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ store i32 %b, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll
index 2671c5d102b3..0ab22b350f50 100644
--- a/test/CodeGen/AMDGPU/big_alu.ll
+++ b/test/CodeGen/AMDGPU/big_alu.ll
@@ -1,1173 +1,1312 @@
-;RUN: llc < %s -march=r600 -mcpu=cedar
+; RUN: llc -march=r600 -mcpu=cedar < %s
-;This test ensures that R600 backend can handle ifcvt properly
-;and do not generate ALU clauses with more than 128 instructions.
+; This test ensures that the R600 backend can handle ifcvt properly
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) {
main_body:
- %0 = extractelement <4 x float> %reg0, i32 0
- %1 = extractelement <4 x float> %reg0, i32 1
- %2 = extractelement <4 x float> %reg0, i32 2
- %3 = extractelement <4 x float> %reg0, i32 3
- %4 = extractelement <4 x float> %reg1, i32 0
- %5 = extractelement <4 x float> %reg9, i32 0
- %6 = extractelement <4 x float> %reg8, i32 0
- %7 = fcmp ugt float %6, 0.000000e+00
- %8 = select i1 %7, float %4, float %5
- %9 = extractelement <4 x float> %reg1, i32 1
- %10 = extractelement <4 x float> %reg9, i32 1
- %11 = extractelement <4 x float> %reg8, i32 0
- %12 = fcmp ugt float %11, 0.000000e+00
- %13 = select i1 %12, float %9, float %10
- %14 = extractelement <4 x float> %reg1, i32 2
- %15 = extractelement <4 x float> %reg9, i32 2
- %16 = extractelement <4 x float> %reg8, i32 0
- %17 = fcmp ugt float %16, 0.000000e+00
- %18 = select i1 %17, float %14, float %15
- %19 = extractelement <4 x float> %reg1, i32 3
- %20 = extractelement <4 x float> %reg9, i32 3
- %21 = extractelement <4 x float> %reg8, i32 0
- %22 = extractelement <4 x float> %reg2, i32 0
- %23 = extractelement <4 x float> %reg2, i32 1
- %24 = extractelement <4 x float> %reg2, i32 2
- %25 = extractelement <4 x float> %reg2, i32 3
- %26 = extractelement <4 x float> %reg3, i32 0
- %27 = extractelement <4 x float> %reg3, i32 1
- %28 = extractelement <4 x float> %reg3, i32 2
- %29 = extractelement <4 x float> %reg3, i32 3
- %30 = extractelement <4 x float> %reg4, i32 0
- %31 = extractelement <4 x float> %reg4, i32 1
- %32 = extractelement <4 x float> %reg4, i32 2
- %33 = extractelement <4 x float> %reg4, i32 3
- %34 = extractelement <4 x float> %reg5, i32 0
- %35 = extractelement <4 x float> %reg5, i32 1
- %36 = extractelement <4 x float> %reg5, i32 2
- %37 = extractelement <4 x float> %reg5, i32 3
- %38 = extractelement <4 x float> %reg6, i32 0
- %39 = extractelement <4 x float> %reg6, i32 1
- %40 = extractelement <4 x float> %reg6, i32 2
- %41 = extractelement <4 x float> %reg6, i32 3
- %42 = extractelement <4 x float> %reg7, i32 0
- %43 = extractelement <4 x float> %reg7, i32 1
- %44 = extractelement <4 x float> %reg7, i32 2
- %45 = extractelement <4 x float> %reg7, i32 3
- %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %47 = extractelement <4 x float> %46, i32 0
- %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %49 = extractelement <4 x float> %48, i32 1
- %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %51 = extractelement <4 x float> %50, i32 2
- %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
- %53 = extractelement <4 x float> %52, i32 0
- %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %55 = extractelement <4 x float> %54, i32 0
- %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %57 = extractelement <4 x float> %56, i32 1
- %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %59 = extractelement <4 x float> %58, i32 2
- %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %61 = extractelement <4 x float> %60, i32 3
- %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %63 = extractelement <4 x float> %62, i32 0
- %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %65 = extractelement <4 x float> %64, i32 1
- %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %67 = extractelement <4 x float> %66, i32 2
- %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %69 = extractelement <4 x float> %68, i32 0
- %70 = fcmp oge float %69, 3.500000e+00
- %71 = sext i1 %70 to i32
- %72 = bitcast i32 %71 to float
- %73 = bitcast float %72 to i32
- %74 = icmp ne i32 %73, 0
- %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
- %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %76 = extractelement <4 x float> %75, i32 0
- %77 = fcmp oge float %76, 2.000000e+00
- %78 = sext i1 %77 to i32
- %79 = bitcast i32 %78 to float
- %80 = bitcast float %79 to i32
- %81 = icmp ne i32 %80, 0
- br i1 %81, label %IF137, label %ENDIF136
+ %tmp = extractelement <4 x float> %reg0, i32 0
+ %tmp1 = extractelement <4 x float> %reg0, i32 1
+ %tmp2 = extractelement <4 x float> %reg0, i32 2
+ %tmp3 = extractelement <4 x float> %reg0, i32 3
+ %tmp4 = extractelement <4 x float> %reg1, i32 0
+ %tmp5 = extractelement <4 x float> %reg9, i32 0
+ %tmp6 = extractelement <4 x float> %reg8, i32 0
+ %tmp7 = fcmp ugt float %tmp6, 0.000000e+00
+ %tmp8 = select i1 %tmp7, float %tmp4, float %tmp5
+ %tmp9 = extractelement <4 x float> %reg1, i32 1
+ %tmp10 = extractelement <4 x float> %reg9, i32 1
+ %tmp11 = extractelement <4 x float> %reg8, i32 0
+ %tmp12 = fcmp ugt float %tmp11, 0.000000e+00
+ %tmp13 = select i1 %tmp12, float %tmp9, float %tmp10
+ %tmp14 = extractelement <4 x float> %reg1, i32 2
+ %tmp15 = extractelement <4 x float> %reg9, i32 2
+ %tmp16 = extractelement <4 x float> %reg8, i32 0
+ %tmp17 = fcmp ugt float %tmp16, 0.000000e+00
+ %tmp18 = select i1 %tmp17, float %tmp14, float %tmp15
+ %tmp19 = extractelement <4 x float> %reg1, i32 3
+ %tmp20 = extractelement <4 x float> %reg9, i32 3
+ %tmp21 = extractelement <4 x float> %reg8, i32 0
+ %tmp22 = extractelement <4 x float> %reg2, i32 0
+ %tmp23 = extractelement <4 x float> %reg2, i32 1
+ %tmp24 = extractelement <4 x float> %reg2, i32 2
+ %tmp25 = extractelement <4 x float> %reg2, i32 3
+ %tmp26 = extractelement <4 x float> %reg3, i32 0
+ %tmp27 = extractelement <4 x float> %reg3, i32 1
+ %tmp28 = extractelement <4 x float> %reg3, i32 2
+ %tmp29 = extractelement <4 x float> %reg3, i32 3
+ %tmp30 = extractelement <4 x float> %reg4, i32 0
+ %tmp31 = extractelement <4 x float> %reg4, i32 1
+ %tmp32 = extractelement <4 x float> %reg4, i32 2
+ %tmp33 = extractelement <4 x float> %reg4, i32 3
+ %tmp34 = extractelement <4 x float> %reg5, i32 0
+ %tmp35 = extractelement <4 x float> %reg5, i32 1
+ %tmp36 = extractelement <4 x float> %reg5, i32 2
+ %tmp37 = extractelement <4 x float> %reg5, i32 3
+ %tmp38 = extractelement <4 x float> %reg6, i32 0
+ %tmp39 = extractelement <4 x float> %reg6, i32 1
+ %tmp40 = extractelement <4 x float> %reg6, i32 2
+ %tmp41 = extractelement <4 x float> %reg6, i32 3
+ %tmp42 = extractelement <4 x float> %reg7, i32 0
+ %tmp43 = extractelement <4 x float> %reg7, i32 1
+ %tmp44 = extractelement <4 x float> %reg7, i32 2
+ %tmp45 = extractelement <4 x float> %reg7, i32 3
+ %tmp46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp47 = extractelement <4 x float> %tmp46, i32 0
+ %tmp48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp49 = extractelement <4 x float> %tmp48, i32 1
+ %tmp50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp51 = extractelement <4 x float> %tmp50, i32 2
+ %tmp52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+ %tmp53 = extractelement <4 x float> %tmp52, i32 0
+ %tmp54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp55 = extractelement <4 x float> %tmp54, i32 0
+ %tmp56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp57 = extractelement <4 x float> %tmp56, i32 1
+ %tmp58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp59 = extractelement <4 x float> %tmp58, i32 2
+ %tmp60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp61 = extractelement <4 x float> %tmp60, i32 3
+ %tmp62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp63 = extractelement <4 x float> %tmp62, i32 0
+ %tmp64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp65 = extractelement <4 x float> %tmp64, i32 1
+ %tmp66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp67 = extractelement <4 x float> %tmp66, i32 2
+ %tmp68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp69 = extractelement <4 x float> %tmp68, i32 0
+ %tmp70 = fcmp oge float %tmp69, 3.500000e+00
+ %tmp71 = sext i1 %tmp70 to i32
+ %tmp72 = bitcast i32 %tmp71 to float
+ %tmp73 = bitcast float %tmp72 to i32
+ %tmp74 = icmp ne i32 %tmp73, 0
+ %. = select i1 %tmp74, float 0.000000e+00, float 0.000000e+00
+ %tmp75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp76 = extractelement <4 x float> %tmp75, i32 0
+ %tmp77 = fcmp oge float %tmp76, 2.000000e+00
+ %tmp78 = sext i1 %tmp77 to i32
+ %tmp79 = bitcast i32 %tmp78 to float
+ %tmp80 = bitcast float %tmp79 to i32
+ %tmp81 = icmp ne i32 %tmp80, 0
+ br i1 %tmp81, label %IF137, label %ENDIF136
IF137: ; preds = %main_body
- %82 = insertelement <4 x float> undef, float %30, i32 0
- %83 = insertelement <4 x float> %82, float %31, i32 1
- %84 = insertelement <4 x float> %83, float %32, i32 2
- %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
- %86 = insertelement <4 x float> undef, float %30, i32 0
- %87 = insertelement <4 x float> %86, float %31, i32 1
- %88 = insertelement <4 x float> %87, float %32, i32 2
- %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
- %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
- %91 = call float @llvm.AMDGPU.rsq.f32(float %90)
- %92 = fmul float %30, %91
- %93 = fmul float %31, %91
- %94 = fmul float %32, %91
- %95 = insertelement <4 x float> undef, float %92, i32 0
- %96 = insertelement <4 x float> %95, float %93, i32 1
- %97 = insertelement <4 x float> %96, float %94, i32 2
- %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3
- %99 = insertelement <4 x float> undef, float %37, i32 0
- %100 = insertelement <4 x float> %99, float %38, i32 1
- %101 = insertelement <4 x float> %100, float %39, i32 2
- %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3
- %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102)
- %104 = insertelement <4 x float> undef, float %92, i32 0
- %105 = insertelement <4 x float> %104, float %93, i32 1
- %106 = insertelement <4 x float> %105, float %94, i32 2
- %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3
- %108 = insertelement <4 x float> undef, float %40, i32 0
- %109 = insertelement <4 x float> %108, float %41, i32 1
- %110 = insertelement <4 x float> %109, float %42, i32 2
- %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
- %112 = call float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111)
- %113 = fsub float -0.000000e+00, %92
- %114 = fsub float -0.000000e+00, %93
- %115 = fsub float -0.000000e+00, %94
- %116 = insertelement <4 x float> undef, float %34, i32 0
- %117 = insertelement <4 x float> %116, float %35, i32 1
- %118 = insertelement <4 x float> %117, float %36, i32 2
- %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
- %120 = insertelement <4 x float> undef, float %113, i32 0
- %121 = insertelement <4 x float> %120, float %114, i32 1
- %122 = insertelement <4 x float> %121, float %115, i32 2
- %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
- %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
- %125 = fdiv float 1.000000e+00, %124
- %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %127 = extractelement <4 x float> %126, i32 0
- %128 = fmul float %127, %125
- %129 = fmul float %103, %128
- %130 = fmul float %112, %128
- %131 = bitcast float %. to i32
- %132 = sitofp i32 %131 to float
- %133 = fdiv float 1.000000e+00, %132
- %134 = bitcast float %. to i32
- %135 = add i32 %134, -1
- %136 = bitcast i32 %135 to float
- %137 = bitcast float %136 to i32
+ %tmp82 = insertelement <4 x float> undef, float %tmp30, i32 0
+ %tmp83 = insertelement <4 x float> %tmp82, float %tmp31, i32 1
+ %tmp84 = insertelement <4 x float> %tmp83, float %tmp32, i32 2
+ %tmp85 = insertelement <4 x float> %tmp84, float 0.000000e+00, i32 3
+ %tmp86 = insertelement <4 x float> undef, float %tmp30, i32 0
+ %tmp87 = insertelement <4 x float> %tmp86, float %tmp31, i32 1
+ %tmp88 = insertelement <4 x float> %tmp87, float %tmp32, i32 2
+ %tmp89 = insertelement <4 x float> %tmp88, float 0.000000e+00, i32 3
+ %tmp90 = call float @llvm.r600.dot4(<4 x float> %tmp85, <4 x float> %tmp89)
+ %tmp91 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp90)
+ %tmp92 = fmul float %tmp30, %tmp91
+ %tmp93 = fmul float %tmp31, %tmp91
+ %tmp94 = fmul float %tmp32, %tmp91
+ %tmp95 = insertelement <4 x float> undef, float %tmp92, i32 0
+ %tmp96 = insertelement <4 x float> %tmp95, float %tmp93, i32 1
+ %tmp97 = insertelement <4 x float> %tmp96, float %tmp94, i32 2
+ %tmp98 = insertelement <4 x float> %tmp97, float 0.000000e+00, i32 3
+ %tmp99 = insertelement <4 x float> undef, float %tmp37, i32 0
+ %tmp100 = insertelement <4 x float> %tmp99, float %tmp38, i32 1
+ %tmp101 = insertelement <4 x float> %tmp100, float %tmp39, i32 2
+ %tmp102 = insertelement <4 x float> %tmp101, float 0.000000e+00, i32 3
+ %tmp103 = call float @llvm.r600.dot4(<4 x float> %tmp98, <4 x float> %tmp102)
+ %tmp104 = insertelement <4 x float> undef, float %tmp92, i32 0
+ %tmp105 = insertelement <4 x float> %tmp104, float %tmp93, i32 1
+ %tmp106 = insertelement <4 x float> %tmp105, float %tmp94, i32 2
+ %tmp107 = insertelement <4 x float> %tmp106, float 0.000000e+00, i32 3
+ %tmp108 = insertelement <4 x float> undef, float %tmp40, i32 0
+ %tmp109 = insertelement <4 x float> %tmp108, float %tmp41, i32 1
+ %tmp110 = insertelement <4 x float> %tmp109, float %tmp42, i32 2
+ %tmp111 = insertelement <4 x float> %tmp110, float 0.000000e+00, i32 3
+ %tmp112 = call float @llvm.r600.dot4(<4 x float> %tmp107, <4 x float> %tmp111)
+ %tmp113 = fsub float -0.000000e+00, %tmp92
+ %tmp114 = fsub float -0.000000e+00, %tmp93
+ %tmp115 = fsub float -0.000000e+00, %tmp94
+ %tmp116 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp117 = insertelement <4 x float> %tmp116, float %tmp35, i32 1
+ %tmp118 = insertelement <4 x float> %tmp117, float %tmp36, i32 2
+ %tmp119 = insertelement <4 x float> %tmp118, float 0.000000e+00, i32 3
+ %tmp120 = insertelement <4 x float> undef, float %tmp113, i32 0
+ %tmp121 = insertelement <4 x float> %tmp120, float %tmp114, i32 1
+ %tmp122 = insertelement <4 x float> %tmp121, float %tmp115, i32 2
+ %tmp123 = insertelement <4 x float> %tmp122, float 0.000000e+00, i32 3
+ %tmp124 = call float @llvm.r600.dot4(<4 x float> %tmp119, <4 x float> %tmp123)
+ %tmp125 = fdiv float 1.000000e+00, %tmp124
+ %tmp126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp127 = extractelement <4 x float> %tmp126, i32 0
+ %tmp128 = fmul float %tmp127, %tmp125
+ %tmp129 = fmul float %tmp103, %tmp128
+ %tmp130 = fmul float %tmp112, %tmp128
+ %tmp131 = bitcast float %. to i32
+ %tmp132 = sitofp i32 %tmp131 to float
+ %tmp133 = fdiv float 1.000000e+00, %tmp132
+ %tmp134 = bitcast float %. to i32
+ %tmp135 = add i32 %tmp134, -1
+ %tmp136 = bitcast i32 %tmp135 to float
+ %tmp137 = bitcast float %tmp136 to i32
br label %LOOP
-ENDIF136: ; preds = %main_body, %ENDIF154
- %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ]
- %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ]
- %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ]
- %138 = fmul float %26, 0x3F847AE140000000
- %139 = fmul float %27, 0x3F847AE140000000
- %140 = fmul float %28, 0x3F847AE140000000
- %141 = insertelement <4 x float> undef, float %138, i32 0
- %142 = insertelement <4 x float> %141, float %139, i32 1
- %143 = insertelement <4 x float> %142, float %140, i32 2
- %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3
- %145 = extractelement <4 x float> %144, i32 0
- %146 = extractelement <4 x float> %144, i32 1
- %147 = extractelement <4 x float> %144, i32 2
- %148 = extractelement <4 x float> %144, i32 3
- %149 = insertelement <4 x float> undef, float %145, i32 0
- %150 = insertelement <4 x float> %149, float %146, i32 1
- %151 = insertelement <4 x float> %150, float %147, i32 2
- %152 = insertelement <4 x float> %151, float %148, i32 3
- %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3)
- %154 = extractelement <4 x float> %153, i32 0
- %155 = extractelement <4 x float> %153, i32 1
- %156 = extractelement <4 x float> %153, i32 2
- %157 = extractelement <4 x float> %153, i32 3
- %158 = fmul float %26, 0x3F45A07B40000000
- %159 = fmul float %27, 0x3F45A07B40000000
- %160 = fmul float %28, 0x3F45A07B40000000
- %161 = insertelement <4 x float> undef, float %158, i32 0
- %162 = insertelement <4 x float> %161, float %159, i32 1
- %163 = insertelement <4 x float> %162, float %160, i32 2
- %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3
- %165 = extractelement <4 x float> %164, i32 0
- %166 = extractelement <4 x float> %164, i32 1
- %167 = extractelement <4 x float> %164, i32 2
- %168 = extractelement <4 x float> %164, i32 3
- %169 = insertelement <4 x float> undef, float %165, i32 0
- %170 = insertelement <4 x float> %169, float %166, i32 1
- %171 = insertelement <4 x float> %170, float %167, i32 2
- %172 = insertelement <4 x float> %171, float %168, i32 3
- %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3)
- %174 = extractelement <4 x float> %173, i32 0
- %175 = extractelement <4 x float> %173, i32 1
- %176 = extractelement <4 x float> %173, i32 2
- %177 = extractelement <4 x float> %173, i32 3
- %178 = fmul float %176, 3.000000e+03
- %179 = fadd float %178, %28
- %180 = fdiv float 1.000000e+00, %33
- %181 = fmul float %32, %180
- %182 = call float @fabs(float %181)
- %183 = fmul float %174, 0x3FD99999A0000000
- %184 = fadd float %183, 0x3FAEB851E0000000
- %185 = fmul float %175, 0x3FE3333340000000
- %186 = fadd float %185, %184
- %187 = fmul float %176, 2.000000e+00
- %188 = fadd float %187, %186
- %189 = fmul float %177, 4.000000e+00
- %190 = fadd float %189, %188
- %191 = fmul float %154, 0x3FB99999A0000000
- %192 = fadd float %191, %190
- %193 = fmul float %155, 0x3FD99999A0000000
- %194 = fadd float %193, %192
- %195 = fmul float %156, 0x3FE99999A0000000
- %196 = fadd float %195, %194
- %197 = fmul float %157, 0x4000CCCCC0000000
- %198 = fadd float %197, %196
- %199 = fmul float 0xBE5EFB4CC0000000, %182
- %200 = fmul float %199, %182
- %201 = call float @llvm.AMDIL.exp.(float %200)
- %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000)
- %203 = fadd float %202, 0x3FF4CCCCC0000000
- %204 = fmul float %203, 0x3FE1C71C80000000
- %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00)
- %206 = fadd float %202, 0x3FF4CCCCC0000000
- %207 = fmul float %206, 0x3FE1C71C80000000
- %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00)
- %209 = fadd float %202, 2.000000e+00
- %210 = fmul float %209, 0x3FD611A7A0000000
- %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00)
- %212 = fmul float 2.000000e+00, %205
- %213 = fsub float -0.000000e+00, %212
- %214 = fadd float 3.000000e+00, %213
- %215 = fmul float %205, %214
- %216 = fmul float %205, %215
- %217 = fmul float 2.000000e+00, %208
- %218 = fsub float -0.000000e+00, %217
- %219 = fadd float 3.000000e+00, %218
- %220 = fmul float %208, %219
- %221 = fmul float %208, %220
- %222 = fmul float 2.000000e+00, %211
- %223 = fsub float -0.000000e+00, %222
- %224 = fadd float 3.000000e+00, %223
- %225 = fmul float %211, %224
- %226 = fmul float %211, %225
- %227 = fmul float %26, 0x3F368B5CC0000000
- %228 = fmul float %27, 0x3F368B5CC0000000
- %229 = insertelement <4 x float> undef, float %227, i32 0
- %230 = insertelement <4 x float> %229, float %228, i32 1
- %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2
- %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3
- %233 = extractelement <4 x float> %232, i32 0
- %234 = extractelement <4 x float> %232, i32 1
- %235 = insertelement <4 x float> undef, float %233, i32 0
- %236 = insertelement <4 x float> %235, float %234, i32 1
- %237 = insertelement <4 x float> %236, float undef, i32 2
- %238 = insertelement <4 x float> %237, float undef, i32 3
- %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2)
- %240 = extractelement <4 x float> %239, i32 0
- %241 = insertelement <4 x float> undef, float %240, i32 0
- %242 = insertelement <4 x float> %241, float %228, i32 1
- %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2
- %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3
- %245 = extractelement <4 x float> %244, i32 0
- %246 = insertelement <4 x float> undef, float %245, i32 0
- %247 = insertelement <4 x float> %246, float undef, i32 1
- %248 = insertelement <4 x float> %247, float undef, i32 2
- %249 = insertelement <4 x float> %248, float undef, i32 3
- %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1)
- %251 = extractelement <4 x float> %250, i32 0
- %252 = extractelement <4 x float> %250, i32 1
- %253 = extractelement <4 x float> %250, i32 2
- %254 = extractelement <4 x float> %250, i32 3
- %255 = fmul float %251, %216
- %256 = fmul float %252, %221
- %257 = fmul float %253, %226
- %258 = fmul float %254, 0.000000e+00
- %259 = fadd float %202, 0x3FF4CCCCC0000000
- %260 = fmul float %259, 0x3FE1C71C80000000
- %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00)
- %262 = fadd float %202, 0x3FF4CCCCC0000000
- %263 = fmul float %262, 0x3FE1C71C80000000
- %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00)
- %265 = fadd float %202, 2.000000e+00
- %266 = fmul float %265, 0x3FD611A7A0000000
- %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00)
- %268 = fmul float 2.000000e+00, %261
- %269 = fsub float -0.000000e+00, %268
- %270 = fadd float 3.000000e+00, %269
- %271 = fmul float %261, %270
- %272 = fmul float %261, %271
- %273 = fmul float 2.000000e+00, %264
- %274 = fsub float -0.000000e+00, %273
- %275 = fadd float 3.000000e+00, %274
- %276 = fmul float %264, %275
- %277 = fmul float %264, %276
- %278 = fmul float 2.000000e+00, %267
- %279 = fsub float -0.000000e+00, %278
- %280 = fadd float 3.000000e+00, %279
- %281 = fmul float %267, %280
- %282 = fmul float %267, %281
- %283 = fmul float %26, 0x3F22DFD6A0000000
- %284 = fmul float %27, 0x3F22DFD6A0000000
- %285 = insertelement <4 x float> undef, float %283, i32 0
- %286 = insertelement <4 x float> %285, float %284, i32 1
- %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2
- %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3
- %289 = extractelement <4 x float> %288, i32 0
- %290 = extractelement <4 x float> %288, i32 1
- %291 = insertelement <4 x float> undef, float %289, i32 0
- %292 = insertelement <4 x float> %291, float %290, i32 1
- %293 = insertelement <4 x float> %292, float undef, i32 2
- %294 = insertelement <4 x float> %293, float undef, i32 3
- %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2)
- %296 = extractelement <4 x float> %295, i32 0
- %297 = extractelement <4 x float> %295, i32 1
- %298 = extractelement <4 x float> %295, i32 2
- %299 = extractelement <4 x float> %295, i32 3
- %300 = fmul float %296, %272
- %301 = fmul float %297, %277
- %302 = fmul float %298, %282
- %303 = fmul float %299, 0.000000e+00
- %304 = fmul float %temp68.1, %37
- %305 = fmul float %temp68.1, %38
- %306 = fmul float %temp68.1, %39
- %307 = fmul float %temp69.0, %40
- %308 = fadd float %307, %304
- %309 = fmul float %temp69.0, %41
- %310 = fadd float %309, %305
- %311 = fmul float %temp69.0, %42
- %312 = fadd float %311, %306
- %313 = fmul float %temp70.0, %34
- %314 = fadd float %313, %308
- %315 = fmul float %temp70.0, %35
- %316 = fadd float %315, %310
- %317 = fmul float %temp70.0, %36
- %318 = fadd float %317, %312
- %319 = insertelement <4 x float> undef, float %314, i32 0
- %320 = insertelement <4 x float> %319, float %316, i32 1
- %321 = insertelement <4 x float> %320, float %318, i32 2
- %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
- %323 = insertelement <4 x float> undef, float %314, i32 0
- %324 = insertelement <4 x float> %323, float %316, i32 1
- %325 = insertelement <4 x float> %324, float %318, i32 2
- %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
- %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
- %328 = call float @llvm.AMDGPU.rsq.f32(float %327)
- %329 = fmul float %314, %328
- %330 = fmul float %316, %328
- %331 = fmul float %318, %328
- %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
- %333 = extractelement <4 x float> %332, i32 0
- %334 = fsub float -0.000000e+00, %333
- %335 = fadd float 1.000000e+00, %334
- %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %337 = extractelement <4 x float> %336, i32 0
- %338 = fsub float -0.000000e+00, %337
- %339 = fadd float 1.000000e+00, %338
- %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %341 = extractelement <4 x float> %340, i32 0
- %342 = fsub float -0.000000e+00, %341
- %343 = fadd float 1.000000e+00, %342
- %344 = fsub float -0.000000e+00, %335
- %345 = fadd float %202, %344
- %346 = fsub float -0.000000e+00, %339
- %347 = fadd float %202, %346
- %348 = fadd float %347, 0xBFE3333340000000
- %349 = fsub float -0.000000e+00, %202
- %350 = fsub float -0.000000e+00, %343
- %351 = fadd float %349, %350
- %352 = insertelement <4 x float> undef, float %43, i32 0
- %353 = insertelement <4 x float> %352, float %44, i32 1
- %354 = insertelement <4 x float> %353, float %45, i32 2
- %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3
- %356 = insertelement <4 x float> undef, float %43, i32 0
- %357 = insertelement <4 x float> %356, float %44, i32 1
- %358 = insertelement <4 x float> %357, float %45, i32 2
- %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
- %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
- %361 = call float @llvm.AMDGPU.rsq.f32(float %360)
- %362 = fmul float %45, %361
- %363 = call float @fabs(float %362)
- %364 = fmul float %176, 0x3FECCCCCC0000000
- %365 = fadd float %364, %363
- %366 = fadd float %365, 0xBFEFAE1480000000
- %367 = fmul float %366, 0xC023FFFFC0000000
- %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00)
- %369 = fsub float -0.000000e+00, %335
- %370 = fadd float %202, %369
- %371 = fadd float %370, 0x3FBEB851E0000000
- %372 = fsub float -0.000000e+00, %339
- %373 = fadd float %202, %372
- %374 = fadd float %373, 0xBFE0A3D700000000
- %375 = fsub float -0.000000e+00, %202
- %376 = fsub float -0.000000e+00, %343
- %377 = fadd float %375, %376
- %378 = insertelement <4 x float> undef, float %43, i32 0
- %379 = insertelement <4 x float> %378, float %44, i32 1
- %380 = insertelement <4 x float> %379, float %45, i32 2
- %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3
- %382 = insertelement <4 x float> undef, float %43, i32 0
- %383 = insertelement <4 x float> %382, float %44, i32 1
- %384 = insertelement <4 x float> %383, float %45, i32 2
- %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
- %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
- %387 = call float @llvm.AMDGPU.rsq.f32(float %386)
- %388 = fmul float %45, %387
- %389 = call float @fabs(float %388)
- %390 = fmul float %176, 0x3FF51EB860000000
- %391 = fadd float %390, %389
- %392 = fadd float %391, 0xBFEFAE1480000000
- %393 = fmul float %392, 0xC0490001A0000000
- %394 = call float @llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00)
- %395 = fmul float 2.000000e+00, %368
- %396 = fsub float -0.000000e+00, %395
- %397 = fadd float 3.000000e+00, %396
- %398 = fmul float %368, %397
- %399 = fmul float %368, %398
- %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345)
- %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348)
- %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351)
- %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00)
- %404 = fmul float 2.000000e+00, %394
- %405 = fsub float -0.000000e+00, %404
- %406 = fadd float 3.000000e+00, %405
- %407 = fmul float %394, %406
- %408 = fmul float %394, %407
- %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371)
- %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374)
- %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377)
- %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000)
- %413 = fcmp oge float 2.200000e+03, %179
- %414 = sext i1 %413 to i32
- %415 = bitcast i32 %414 to float
- %416 = bitcast float %415 to i32
- %417 = icmp ne i32 %416, 0
- br i1 %417, label %IF161, label %ENDIF160
+ENDIF136: ; preds = %ENDIF154, %main_body
+ %temp68.1 = phi float [ %tmp603, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp69.0 = phi float [ %tmp605, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp70.0 = phi float [ %tmp607, %ENDIF154 ], [ 1.000000e+00, %main_body ]
+ %tmp138 = fmul float %tmp26, 0x3F847AE140000000
+ %tmp139 = fmul float %tmp27, 0x3F847AE140000000
+ %tmp140 = fmul float %tmp28, 0x3F847AE140000000
+ %tmp141 = insertelement <4 x float> undef, float %tmp138, i32 0
+ %tmp142 = insertelement <4 x float> %tmp141, float %tmp139, i32 1
+ %tmp143 = insertelement <4 x float> %tmp142, float %tmp140, i32 2
+ %tmp144 = insertelement <4 x float> %tmp143, float 0.000000e+00, i32 3
+ %tmp145 = extractelement <4 x float> %tmp144, i32 0
+ %tmp146 = extractelement <4 x float> %tmp144, i32 1
+ %tmp147 = extractelement <4 x float> %tmp144, i32 2
+ %tmp148 = extractelement <4 x float> %tmp144, i32 3
+ %tmp149 = insertelement <4 x float> undef, float %tmp145, i32 0
+ %tmp150 = insertelement <4 x float> %tmp149, float %tmp146, i32 1
+ %tmp151 = insertelement <4 x float> %tmp150, float %tmp147, i32 2
+ %tmp152 = insertelement <4 x float> %tmp151, float %tmp148, i32 3
+ %tmp153 = shufflevector <4 x float> %tmp152, <4 x float> %tmp152, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp154 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp153, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp155 = extractelement <4 x float> %tmp154, i32 0
+ %tmp156 = extractelement <4 x float> %tmp154, i32 1
+ %tmp157 = extractelement <4 x float> %tmp154, i32 2
+ %tmp158 = extractelement <4 x float> %tmp154, i32 3
+ %tmp159 = fmul float %tmp26, 0x3F45A07B40000000
+ %tmp160 = fmul float %tmp27, 0x3F45A07B40000000
+ %tmp161 = fmul float %tmp28, 0x3F45A07B40000000
+ %tmp162 = insertelement <4 x float> undef, float %tmp159, i32 0
+ %tmp163 = insertelement <4 x float> %tmp162, float %tmp160, i32 1
+ %tmp164 = insertelement <4 x float> %tmp163, float %tmp161, i32 2
+ %tmp165 = insertelement <4 x float> %tmp164, float 0.000000e+00, i32 3
+ %tmp166 = extractelement <4 x float> %tmp165, i32 0
+ %tmp167 = extractelement <4 x float> %tmp165, i32 1
+ %tmp168 = extractelement <4 x float> %tmp165, i32 2
+ %tmp169 = extractelement <4 x float> %tmp165, i32 3
+ %tmp170 = insertelement <4 x float> undef, float %tmp166, i32 0
+ %tmp171 = insertelement <4 x float> %tmp170, float %tmp167, i32 1
+ %tmp172 = insertelement <4 x float> %tmp171, float %tmp168, i32 2
+ %tmp173 = insertelement <4 x float> %tmp172, float %tmp169, i32 3
+ %tmp174 = shufflevector <4 x float> %tmp173, <4 x float> %tmp173, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp175 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp174, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp176 = extractelement <4 x float> %tmp175, i32 0
+ %tmp177 = extractelement <4 x float> %tmp175, i32 1
+ %tmp178 = extractelement <4 x float> %tmp175, i32 2
+ %tmp179 = extractelement <4 x float> %tmp175, i32 3
+ %tmp180 = fmul float %tmp178, 3.000000e+03
+ %tmp181 = fadd float %tmp180, %tmp28
+ %tmp182 = fdiv float 1.000000e+00, %tmp33
+ %tmp183 = fmul float %tmp32, %tmp182
+ %tmp184 = call float @llvm.fabs.f32(float %tmp183)
+ %tmp185 = fmul float %tmp176, 0x3FD99999A0000000
+ %tmp186 = fadd float %tmp185, 0x3FAEB851E0000000
+ %tmp187 = fmul float %tmp177, 0x3FE3333340000000
+ %tmp188 = fadd float %tmp187, %tmp186
+ %tmp189 = fmul float %tmp178, 2.000000e+00
+ %tmp190 = fadd float %tmp189, %tmp188
+ %tmp191 = fmul float %tmp179, 4.000000e+00
+ %tmp192 = fadd float %tmp191, %tmp190
+ %tmp193 = fmul float %tmp155, 0x3FB99999A0000000
+ %tmp194 = fadd float %tmp193, %tmp192
+ %tmp195 = fmul float %tmp156, 0x3FD99999A0000000
+ %tmp196 = fadd float %tmp195, %tmp194
+ %tmp197 = fmul float %tmp157, 0x3FE99999A0000000
+ %tmp198 = fadd float %tmp197, %tmp196
+ %tmp199 = fmul float %tmp158, 0x4000CCCCC0000000
+ %tmp200 = fadd float %tmp199, %tmp198
+ %tmp201 = fmul float 0xBE5EFB4CC0000000, %tmp184
+ %tmp202 = fmul float %tmp201, %tmp184
+ %tmp203 = call float @llvm.exp2.f32(float %tmp202)
+ %one.sub.a.i = fsub float 1.000000e+00, %tmp203
+ %one.sub.ac.i = fmul float %one.sub.a.i, 0x3FA99999A0000000
+ %mul.i = fmul float %tmp200, 0x3FA99999A0000000
+ %result.i = fadd float %mul.i, %one.sub.ac.i
+ %tmp204 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp205 = fmul float %tmp204, 0x3FE1C71C80000000
+ %tmp206 = call float @llvm.AMDGPU.clamp.f32(float %tmp205, float 0.000000e+00, float 1.000000e+00)
+ %tmp207 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp208 = fmul float %tmp207, 0x3FE1C71C80000000
+ %tmp209 = call float @llvm.AMDGPU.clamp.f32(float %tmp208, float 0.000000e+00, float 1.000000e+00)
+ %tmp210 = fadd float %result.i, 2.000000e+00
+ %tmp211 = fmul float %tmp210, 0x3FD611A7A0000000
+ %tmp212 = call float @llvm.AMDGPU.clamp.f32(float %tmp211, float 0.000000e+00, float 1.000000e+00)
+ %tmp213 = fmul float 2.000000e+00, %tmp206
+ %tmp214 = fsub float -0.000000e+00, %tmp213
+ %tmp215 = fadd float 3.000000e+00, %tmp214
+ %tmp216 = fmul float %tmp206, %tmp215
+ %tmp217 = fmul float %tmp206, %tmp216
+ %tmp218 = fmul float 2.000000e+00, %tmp209
+ %tmp219 = fsub float -0.000000e+00, %tmp218
+ %tmp220 = fadd float 3.000000e+00, %tmp219
+ %tmp221 = fmul float %tmp209, %tmp220
+ %tmp222 = fmul float %tmp209, %tmp221
+ %tmp223 = fmul float 2.000000e+00, %tmp212
+ %tmp224 = fsub float -0.000000e+00, %tmp223
+ %tmp225 = fadd float 3.000000e+00, %tmp224
+ %tmp226 = fmul float %tmp212, %tmp225
+ %tmp227 = fmul float %tmp212, %tmp226
+ %tmp228 = fmul float %tmp26, 0x3F368B5CC0000000
+ %tmp229 = fmul float %tmp27, 0x3F368B5CC0000000
+ %tmp230 = insertelement <4 x float> undef, float %tmp228, i32 0
+ %tmp231 = insertelement <4 x float> %tmp230, float %tmp229, i32 1
+ %tmp232 = insertelement <4 x float> %tmp231, float 0.000000e+00, i32 2
+ %tmp233 = insertelement <4 x float> %tmp232, float 0.000000e+00, i32 3
+ %tmp234 = extractelement <4 x float> %tmp233, i32 0
+ %tmp235 = extractelement <4 x float> %tmp233, i32 1
+ %tmp236 = insertelement <4 x float> undef, float %tmp234, i32 0
+ %tmp237 = insertelement <4 x float> %tmp236, float %tmp235, i32 1
+ %tmp238 = insertelement <4 x float> %tmp237, float undef, i32 2
+ %tmp239 = insertelement <4 x float> %tmp238, float undef, i32 3
+ %tmp240 = shufflevector <4 x float> %tmp239, <4 x float> %tmp239, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp241 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp240, i32 0, i32 0, i32 0, i32 17, i32 1, i32 1, i32 1, i32 1, i32 1)
+ %tmp242 = extractelement <4 x float> %tmp241, i32 0
+ %tmp243 = insertelement <4 x float> undef, float %tmp242, i32 0
+ %tmp244 = insertelement <4 x float> %tmp243, float %tmp229, i32 1
+ %tmp245 = insertelement <4 x float> %tmp244, float 0.000000e+00, i32 2
+ %tmp246 = insertelement <4 x float> %tmp245, float 0.000000e+00, i32 3
+ %tmp247 = extractelement <4 x float> %tmp246, i32 0
+ %tmp248 = insertelement <4 x float> undef, float %tmp247, i32 0
+ %tmp249 = insertelement <4 x float> %tmp248, float undef, i32 1
+ %tmp250 = insertelement <4 x float> %tmp249, float undef, i32 2
+ %tmp251 = insertelement <4 x float> %tmp250, float undef, i32 3
+ %tmp252 = shufflevector <4 x float> %tmp251, <4 x float> %tmp251, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp253 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp252, i32 0, i32 0, i32 0, i32 18, i32 2, i32 1, i32 1, i32 1, i32 1)
+ %tmp254 = extractelement <4 x float> %tmp253, i32 0
+ %tmp255 = extractelement <4 x float> %tmp253, i32 1
+ %tmp256 = extractelement <4 x float> %tmp253, i32 2
+ %tmp257 = extractelement <4 x float> %tmp253, i32 3
+ %tmp258 = fmul float %tmp254, %tmp217
+ %tmp259 = fmul float %tmp255, %tmp222
+ %tmp260 = fmul float %tmp256, %tmp227
+ %tmp261 = fmul float %tmp257, 0.000000e+00
+ %tmp262 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp263 = fmul float %tmp262, 0x3FE1C71C80000000
+ %tmp264 = call float @llvm.AMDGPU.clamp.f32(float %tmp263, float 0.000000e+00, float 1.000000e+00)
+ %tmp265 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp266 = fmul float %tmp265, 0x3FE1C71C80000000
+ %tmp267 = call float @llvm.AMDGPU.clamp.f32(float %tmp266, float 0.000000e+00, float 1.000000e+00)
+ %tmp268 = fadd float %result.i, 2.000000e+00
+ %tmp269 = fmul float %tmp268, 0x3FD611A7A0000000
+ %tmp270 = call float @llvm.AMDGPU.clamp.f32(float %tmp269, float 0.000000e+00, float 1.000000e+00)
+ %tmp271 = fmul float 2.000000e+00, %tmp264
+ %tmp272 = fsub float -0.000000e+00, %tmp271
+ %tmp273 = fadd float 3.000000e+00, %tmp272
+ %tmp274 = fmul float %tmp264, %tmp273
+ %tmp275 = fmul float %tmp264, %tmp274
+ %tmp276 = fmul float 2.000000e+00, %tmp267
+ %tmp277 = fsub float -0.000000e+00, %tmp276
+ %tmp278 = fadd float 3.000000e+00, %tmp277
+ %tmp279 = fmul float %tmp267, %tmp278
+ %tmp280 = fmul float %tmp267, %tmp279
+ %tmp281 = fmul float 2.000000e+00, %tmp270
+ %tmp282 = fsub float -0.000000e+00, %tmp281
+ %tmp283 = fadd float 3.000000e+00, %tmp282
+ %tmp284 = fmul float %tmp270, %tmp283
+ %tmp285 = fmul float %tmp270, %tmp284
+ %tmp286 = fmul float %tmp26, 0x3F22DFD6A0000000
+ %tmp287 = fmul float %tmp27, 0x3F22DFD6A0000000
+ %tmp288 = insertelement <4 x float> undef, float %tmp286, i32 0
+ %tmp289 = insertelement <4 x float> %tmp288, float %tmp287, i32 1
+ %tmp290 = insertelement <4 x float> %tmp289, float 0.000000e+00, i32 2
+ %tmp291 = insertelement <4 x float> %tmp290, float 0.000000e+00, i32 3
+ %tmp292 = extractelement <4 x float> %tmp291, i32 0
+ %tmp293 = extractelement <4 x float> %tmp291, i32 1
+ %tmp294 = insertelement <4 x float> undef, float %tmp292, i32 0
+ %tmp295 = insertelement <4 x float> %tmp294, float %tmp293, i32 1
+ %tmp296 = insertelement <4 x float> %tmp295, float undef, i32 2
+ %tmp297 = insertelement <4 x float> %tmp296, float undef, i32 3
+ %tmp298 = shufflevector <4 x float> %tmp297, <4 x float> %tmp297, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp299 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp298, i32 0, i32 0, i32 0, i32 19, i32 3, i32 1, i32 1, i32 1, i32 1)
+ %tmp300 = extractelement <4 x float> %tmp299, i32 0
+ %tmp301 = extractelement <4 x float> %tmp299, i32 1
+ %tmp302 = extractelement <4 x float> %tmp299, i32 2
+ %tmp303 = extractelement <4 x float> %tmp299, i32 3
+ %tmp304 = fmul float %tmp300, %tmp275
+ %tmp305 = fmul float %tmp301, %tmp280
+ %tmp306 = fmul float %tmp302, %tmp285
+ %tmp307 = fmul float %tmp303, 0.000000e+00
+ %tmp308 = fmul float %temp68.1, %tmp37
+ %tmp309 = fmul float %temp68.1, %tmp38
+ %tmp310 = fmul float %temp68.1, %tmp39
+ %tmp311 = fmul float %temp69.0, %tmp40
+ %tmp312 = fadd float %tmp311, %tmp308
+ %tmp313 = fmul float %temp69.0, %tmp41
+ %tmp314 = fadd float %tmp313, %tmp309
+ %tmp315 = fmul float %temp69.0, %tmp42
+ %tmp316 = fadd float %tmp315, %tmp310
+ %tmp317 = fmul float %temp70.0, %tmp34
+ %tmp318 = fadd float %tmp317, %tmp312
+ %tmp319 = fmul float %temp70.0, %tmp35
+ %tmp320 = fadd float %tmp319, %tmp314
+ %tmp321 = fmul float %temp70.0, %tmp36
+ %tmp322 = fadd float %tmp321, %tmp316
+ %tmp323 = insertelement <4 x float> undef, float %tmp318, i32 0
+ %tmp324 = insertelement <4 x float> %tmp323, float %tmp320, i32 1
+ %tmp325 = insertelement <4 x float> %tmp324, float %tmp322, i32 2
+ %tmp326 = insertelement <4 x float> %tmp325, float 0.000000e+00, i32 3
+ %tmp327 = insertelement <4 x float> undef, float %tmp318, i32 0
+ %tmp328 = insertelement <4 x float> %tmp327, float %tmp320, i32 1
+ %tmp329 = insertelement <4 x float> %tmp328, float %tmp322, i32 2
+ %tmp330 = insertelement <4 x float> %tmp329, float 0.000000e+00, i32 3
+ %tmp331 = call float @llvm.r600.dot4(<4 x float> %tmp326, <4 x float> %tmp330)
+ %tmp332 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp331)
+ %tmp333 = fmul float %tmp318, %tmp332
+ %tmp334 = fmul float %tmp320, %tmp332
+ %tmp335 = fmul float %tmp322, %tmp332
+ %tmp336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %tmp337 = extractelement <4 x float> %tmp336, i32 0
+ %tmp338 = fsub float -0.000000e+00, %tmp337
+ %tmp339 = fadd float 1.000000e+00, %tmp338
+ %tmp340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp341 = extractelement <4 x float> %tmp340, i32 0
+ %tmp342 = fsub float -0.000000e+00, %tmp341
+ %tmp343 = fadd float 1.000000e+00, %tmp342
+ %tmp344 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp345 = extractelement <4 x float> %tmp344, i32 0
+ %tmp346 = fsub float -0.000000e+00, %tmp345
+ %tmp347 = fadd float 1.000000e+00, %tmp346
+ %tmp348 = fsub float -0.000000e+00, %tmp339
+ %tmp349 = fadd float %result.i, %tmp348
+ %tmp350 = fsub float -0.000000e+00, %tmp343
+ %tmp351 = fadd float %result.i, %tmp350
+ %tmp352 = fadd float %tmp351, 0xBFE3333340000000
+ %tmp353 = fsub float -0.000000e+00, %result.i
+ %tmp354 = fsub float -0.000000e+00, %tmp347
+ %tmp355 = fadd float %tmp353, %tmp354
+ %tmp356 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp357 = insertelement <4 x float> %tmp356, float %tmp44, i32 1
+ %tmp358 = insertelement <4 x float> %tmp357, float %tmp45, i32 2
+ %tmp359 = insertelement <4 x float> %tmp358, float 0.000000e+00, i32 3
+ %tmp360 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp361 = insertelement <4 x float> %tmp360, float %tmp44, i32 1
+ %tmp362 = insertelement <4 x float> %tmp361, float %tmp45, i32 2
+ %tmp363 = insertelement <4 x float> %tmp362, float 0.000000e+00, i32 3
+ %tmp364 = call float @llvm.r600.dot4(<4 x float> %tmp359, <4 x float> %tmp363)
+ %tmp365 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp364)
+ %tmp366 = fmul float %tmp45, %tmp365
+ %tmp367 = call float @llvm.fabs.f32(float %tmp366)
+ %tmp368 = fmul float %tmp178, 0x3FECCCCCC0000000
+ %tmp369 = fadd float %tmp368, %tmp367
+ %tmp370 = fadd float %tmp369, 0xBFEFAE1480000000
+ %tmp371 = fmul float %tmp370, 0xC023FFFFC0000000
+ %tmp372 = call float @llvm.AMDGPU.clamp.f32(float %tmp371, float 0.000000e+00, float 1.000000e+00)
+ %tmp373 = fsub float -0.000000e+00, %tmp339
+ %tmp374 = fadd float %result.i, %tmp373
+ %tmp375 = fadd float %tmp374, 0x3FBEB851E0000000
+ %tmp376 = fsub float -0.000000e+00, %tmp343
+ %tmp377 = fadd float %result.i, %tmp376
+ %tmp378 = fadd float %tmp377, 0xBFE0A3D700000000
+ %tmp379 = fsub float -0.000000e+00, %result.i
+ %tmp380 = fsub float -0.000000e+00, %tmp347
+ %tmp381 = fadd float %tmp379, %tmp380
+ %tmp382 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp383 = insertelement <4 x float> %tmp382, float %tmp44, i32 1
+ %tmp384 = insertelement <4 x float> %tmp383, float %tmp45, i32 2
+ %tmp385 = insertelement <4 x float> %tmp384, float 0.000000e+00, i32 3
+ %tmp386 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp387 = insertelement <4 x float> %tmp386, float %tmp44, i32 1
+ %tmp388 = insertelement <4 x float> %tmp387, float %tmp45, i32 2
+ %tmp389 = insertelement <4 x float> %tmp388, float 0.000000e+00, i32 3
+ %tmp390 = call float @llvm.r600.dot4(<4 x float> %tmp385, <4 x float> %tmp389)
+ %tmp391 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp390)
+ %tmp392 = fmul float %tmp45, %tmp391
+ %tmp393 = call float @llvm.fabs.f32(float %tmp392)
+ %tmp394 = fmul float %tmp178, 0x3FF51EB860000000
+ %tmp395 = fadd float %tmp394, %tmp393
+ %tmp396 = fadd float %tmp395, 0xBFEFAE1480000000
+ %tmp397 = fmul float %tmp396, 0xC0490001A0000000
+ %tmp398 = call float @llvm.AMDGPU.clamp.f32(float %tmp397, float 0.000000e+00, float 1.000000e+00)
+ %tmp399 = fmul float 2.000000e+00, %tmp372
+ %tmp400 = fsub float -0.000000e+00, %tmp399
+ %tmp401 = fadd float 3.000000e+00, %tmp400
+ %tmp402 = fmul float %tmp372, %tmp401
+ %tmp403 = fmul float %tmp372, %tmp402
+ %one.sub.a.i169 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i170 = fmul float %one.sub.a.i169, %tmp349
+ %mul.i171 = fmul float %tmp258, %tmp349
+ %result.i172 = fadd float %mul.i171, %one.sub.ac.i170
+ %one.sub.a.i165 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i166 = fmul float %one.sub.a.i165, %tmp352
+ %mul.i167 = fmul float %tmp259, %tmp352
+ %result.i168 = fadd float %mul.i167, %one.sub.ac.i166
+ %one.sub.a.i161 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i162 = fmul float %one.sub.a.i161, %tmp355
+ %mul.i163 = fmul float %tmp260, %tmp355
+ %result.i164 = fadd float %mul.i163, %one.sub.ac.i162
+ %one.sub.a.i157 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i158 = fmul float %one.sub.a.i157, 0.000000e+00
+ %mul.i159 = fmul float %tmp261, 0.000000e+00
+ %result.i160 = fadd float %mul.i159, %one.sub.ac.i158
+ %tmp404 = fmul float 2.000000e+00, %tmp398
+ %tmp405 = fsub float -0.000000e+00, %tmp404
+ %tmp406 = fadd float 3.000000e+00, %tmp405
+ %tmp407 = fmul float %tmp398, %tmp406
+ %tmp408 = fmul float %tmp398, %tmp407
+ %one.sub.a.i153 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i154 = fmul float %one.sub.a.i153, %tmp375
+ %mul.i155 = fmul float %tmp258, %tmp375
+ %result.i156 = fadd float %mul.i155, %one.sub.ac.i154
+ %one.sub.a.i149 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i150 = fmul float %one.sub.a.i149, %tmp378
+ %mul.i151 = fmul float %tmp259, %tmp378
+ %result.i152 = fadd float %mul.i151, %one.sub.ac.i150
+ %one.sub.a.i145 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i146 = fmul float %one.sub.a.i145, %tmp381
+ %mul.i147 = fmul float %tmp260, %tmp381
+ %result.i148 = fadd float %mul.i147, %one.sub.ac.i146
+ %one.sub.a.i141 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i142 = fmul float %one.sub.a.i141, 0x3FD3333340000000
+ %mul.i143 = fmul float %tmp261, 0x3FD3333340000000
+ %result.i144 = fadd float %mul.i143, %one.sub.ac.i142
+ %tmp409 = fcmp oge float 2.200000e+03, %tmp181
+ %tmp410 = sext i1 %tmp409 to i32
+ %tmp411 = bitcast i32 %tmp410 to float
+ %tmp412 = bitcast float %tmp411 to i32
+ %tmp413 = icmp ne i32 %tmp412, 0
+ br i1 %tmp413, label %IF161, label %ENDIF160
LOOP: ; preds = %ENDIF139, %IF137
- %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ]
+ %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %tmp443, %ENDIF139 ]
%temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ]
- %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ]
- %418 = bitcast float %temp96.0 to i32
- %419 = icmp sge i32 %418, %137
- %420 = sext i1 %419 to i32
- %421 = bitcast i32 %420 to float
- %422 = bitcast float %421 to i32
- %423 = icmp ne i32 %422, 0
- br i1 %423, label %IF140, label %ENDIF139
+ %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %tmp475, %ENDIF139 ]
+ %tmp414 = bitcast float %temp96.0 to i32
+ %tmp415 = icmp sge i32 %tmp414, %tmp137
+ %tmp416 = sext i1 %tmp415 to i32
+ %tmp417 = bitcast i32 %tmp416 to float
+ %tmp418 = bitcast float %tmp417 to i32
+ %tmp419 = icmp ne i32 %tmp418, 0
+ br i1 %tmp419, label %IF140, label %ENDIF139
IF140: ; preds = %LOOP
- %424 = fmul float %133, 5.000000e-01
- %425 = fmul float %129, %temp92.0
- %426 = fadd float %425, %22
- %427 = fmul float %130, %temp92.0
- %428 = fadd float %427, %23
- %429 = insertelement <4 x float> undef, float %426, i32 0
- %430 = insertelement <4 x float> %429, float %428, i32 1
- %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2
- %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3
- %433 = extractelement <4 x float> %432, i32 0
- %434 = extractelement <4 x float> %432, i32 1
- %435 = insertelement <4 x float> undef, float %433, i32 0
- %436 = insertelement <4 x float> %435, float %434, i32 1
- %437 = insertelement <4 x float> %436, float undef, i32 2
- %438 = insertelement <4 x float> %437, float undef, i32 3
- %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2)
- %440 = extractelement <4 x float> %439, i32 3
- %441 = fcmp oge float %temp92.0, %440
- %442 = sext i1 %441 to i32
- %443 = bitcast i32 %442 to float
- %444 = bitcast float %443 to i32
- %445 = icmp ne i32 %444, 0
- br i1 %445, label %IF146, label %ENDIF145
+ %tmp420 = fmul float %tmp133, 5.000000e-01
+ %tmp421 = fmul float %tmp129, %temp92.0
+ %tmp422 = fadd float %tmp421, %tmp22
+ %tmp423 = fmul float %tmp130, %temp92.0
+ %tmp424 = fadd float %tmp423, %tmp23
+ %tmp425 = insertelement <4 x float> undef, float %tmp422, i32 0
+ %tmp426 = insertelement <4 x float> %tmp425, float %tmp424, i32 1
+ %tmp427 = insertelement <4 x float> %tmp426, float 0.000000e+00, i32 2
+ %tmp428 = insertelement <4 x float> %tmp427, float 0.000000e+00, i32 3
+ %tmp429 = extractelement <4 x float> %tmp428, i32 0
+ %tmp430 = extractelement <4 x float> %tmp428, i32 1
+ %tmp431 = insertelement <4 x float> undef, float %tmp429, i32 0
+ %tmp432 = insertelement <4 x float> %tmp431, float %tmp430, i32 1
+ %tmp433 = insertelement <4 x float> %tmp432, float undef, i32 2
+ %tmp434 = insertelement <4 x float> %tmp433, float undef, i32 3
+ %tmp435 = shufflevector <4 x float> %tmp434, <4 x float> %tmp434, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp436 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp435, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp437 = extractelement <4 x float> %tmp436, i32 3
+ %tmp438 = fcmp oge float %temp92.0, %tmp437
+ %tmp439 = sext i1 %tmp438 to i32
+ %tmp440 = bitcast i32 %tmp439 to float
+ %tmp441 = bitcast float %tmp440 to i32
+ %tmp442 = icmp ne i32 %tmp441, 0
+ br i1 %tmp442, label %IF146, label %ENDIF145
ENDIF139: ; preds = %LOOP
- %446 = fadd float %temp88.0, %133
- %447 = fmul float %129, %446
- %448 = fadd float %447, %22
- %449 = fmul float %130, %446
- %450 = fadd float %449, %23
- %451 = insertelement <4 x float> undef, float %448, i32 0
- %452 = insertelement <4 x float> %451, float %450, i32 1
- %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2
- %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3
- %455 = extractelement <4 x float> %454, i32 0
- %456 = extractelement <4 x float> %454, i32 1
- %457 = insertelement <4 x float> undef, float %455, i32 0
- %458 = insertelement <4 x float> %457, float %456, i32 1
- %459 = insertelement <4 x float> %458, float undef, i32 2
- %460 = insertelement <4 x float> %459, float undef, i32 3
- %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2)
- %462 = extractelement <4 x float> %461, i32 3
- %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
- %464 = sext i1 %463 to i32
- %465 = bitcast i32 %464 to float
- %466 = fcmp oge float %446, %462
- %467 = sext i1 %466 to i32
- %468 = bitcast i32 %467 to float
- %469 = bitcast float %465 to i32
- %470 = bitcast float %468 to i32
- %471 = and i32 %469, %470
- %472 = bitcast i32 %471 to float
- %473 = bitcast float %472 to i32
- %474 = icmp ne i32 %473, 0
- %.temp92.0 = select i1 %474, float %446, float %temp92.0
- %475 = bitcast float %temp96.0 to i32
- %476 = add i32 %475, 1
- %477 = bitcast i32 %476 to float
+ %tmp443 = fadd float %temp88.0, %tmp133
+ %tmp444 = fmul float %tmp129, %tmp443
+ %tmp445 = fadd float %tmp444, %tmp22
+ %tmp446 = fmul float %tmp130, %tmp443
+ %tmp447 = fadd float %tmp446, %tmp23
+ %tmp448 = insertelement <4 x float> undef, float %tmp445, i32 0
+ %tmp449 = insertelement <4 x float> %tmp448, float %tmp447, i32 1
+ %tmp450 = insertelement <4 x float> %tmp449, float 0.000000e+00, i32 2
+ %tmp451 = insertelement <4 x float> %tmp450, float 0.000000e+00, i32 3
+ %tmp452 = extractelement <4 x float> %tmp451, i32 0
+ %tmp453 = extractelement <4 x float> %tmp451, i32 1
+ %tmp454 = insertelement <4 x float> undef, float %tmp452, i32 0
+ %tmp455 = insertelement <4 x float> %tmp454, float %tmp453, i32 1
+ %tmp456 = insertelement <4 x float> %tmp455, float undef, i32 2
+ %tmp457 = insertelement <4 x float> %tmp456, float undef, i32 3
+ %tmp458 = shufflevector <4 x float> %tmp457, <4 x float> %tmp457, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp459 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp458, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp460 = extractelement <4 x float> %tmp459, i32 3
+ %tmp461 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
+ %tmp462 = sext i1 %tmp461 to i32
+ %tmp463 = bitcast i32 %tmp462 to float
+ %tmp464 = fcmp oge float %tmp443, %tmp460
+ %tmp465 = sext i1 %tmp464 to i32
+ %tmp466 = bitcast i32 %tmp465 to float
+ %tmp467 = bitcast float %tmp463 to i32
+ %tmp468 = bitcast float %tmp466 to i32
+ %tmp469 = and i32 %tmp467, %tmp468
+ %tmp470 = bitcast i32 %tmp469 to float
+ %tmp471 = bitcast float %tmp470 to i32
+ %tmp472 = icmp ne i32 %tmp471, 0
+ %.temp92.0 = select i1 %tmp472, float %tmp443, float %temp92.0
+ %tmp473 = bitcast float %temp96.0 to i32
+ %tmp474 = add i32 %tmp473, 1
+ %tmp475 = bitcast i32 %tmp474 to float
br label %LOOP
IF146: ; preds = %IF140
- %478 = fmul float 2.000000e+00, %424
- %479 = fsub float -0.000000e+00, %478
- %480 = fadd float %temp92.0, %479
+ %tmp476 = fmul float 2.000000e+00, %tmp420
+ %tmp477 = fsub float -0.000000e+00, %tmp476
+ %tmp478 = fadd float %temp92.0, %tmp477
br label %ENDIF145
-ENDIF145: ; preds = %IF140, %IF146
- %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ]
- %481 = fadd float %temp88.1, %424
- %482 = fmul float %424, 5.000000e-01
- %483 = fmul float %129, %481
- %484 = fadd float %483, %22
- %485 = fmul float %130, %481
- %486 = fadd float %485, %23
- %487 = insertelement <4 x float> undef, float %484, i32 0
- %488 = insertelement <4 x float> %487, float %486, i32 1
- %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2
- %490 = insertelement <4 x float> %489, float %440, i32 3
- %491 = extractelement <4 x float> %490, i32 0
- %492 = extractelement <4 x float> %490, i32 1
- %493 = insertelement <4 x float> undef, float %491, i32 0
- %494 = insertelement <4 x float> %493, float %492, i32 1
- %495 = insertelement <4 x float> %494, float undef, i32 2
- %496 = insertelement <4 x float> %495, float undef, i32 3
- %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2)
- %498 = extractelement <4 x float> %497, i32 3
- %499 = fcmp oge float %481, %498
- %500 = sext i1 %499 to i32
- %501 = bitcast i32 %500 to float
- %502 = bitcast float %501 to i32
- %503 = icmp ne i32 %502, 0
- br i1 %503, label %IF149, label %ENDIF148
+ENDIF145: ; preds = %IF146, %IF140
+ %temp88.1 = phi float [ %tmp478, %IF146 ], [ %temp92.0, %IF140 ]
+ %tmp479 = fadd float %temp88.1, %tmp420
+ %tmp480 = fmul float %tmp420, 5.000000e-01
+ %tmp481 = fmul float %tmp129, %tmp479
+ %tmp482 = fadd float %tmp481, %tmp22
+ %tmp483 = fmul float %tmp130, %tmp479
+ %tmp484 = fadd float %tmp483, %tmp23
+ %tmp485 = insertelement <4 x float> undef, float %tmp482, i32 0
+ %tmp486 = insertelement <4 x float> %tmp485, float %tmp484, i32 1
+ %tmp487 = insertelement <4 x float> %tmp486, float 0.000000e+00, i32 2
+ %tmp488 = insertelement <4 x float> %tmp487, float %tmp437, i32 3
+ %tmp489 = extractelement <4 x float> %tmp488, i32 0
+ %tmp490 = extractelement <4 x float> %tmp488, i32 1
+ %tmp491 = insertelement <4 x float> undef, float %tmp489, i32 0
+ %tmp492 = insertelement <4 x float> %tmp491, float %tmp490, i32 1
+ %tmp493 = insertelement <4 x float> %tmp492, float undef, i32 2
+ %tmp494 = insertelement <4 x float> %tmp493, float undef, i32 3
+ %tmp495 = shufflevector <4 x float> %tmp494, <4 x float> %tmp494, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp496 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp495, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp497 = extractelement <4 x float> %tmp496, i32 3
+ %tmp498 = fcmp oge float %tmp479, %tmp497
+ %tmp499 = sext i1 %tmp498 to i32
+ %tmp500 = bitcast i32 %tmp499 to float
+ %tmp501 = bitcast float %tmp500 to i32
+ %tmp502 = icmp ne i32 %tmp501, 0
+ br i1 %tmp502, label %IF149, label %ENDIF148
IF149: ; preds = %ENDIF145
- %504 = fmul float 2.000000e+00, %482
- %505 = fsub float -0.000000e+00, %504
- %506 = fadd float %481, %505
+ %tmp503 = fmul float 2.000000e+00, %tmp480
+ %tmp504 = fsub float -0.000000e+00, %tmp503
+ %tmp505 = fadd float %tmp479, %tmp504
br label %ENDIF148
-ENDIF148: ; preds = %ENDIF145, %IF149
- %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ]
- %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ]
- %507 = fadd float %temp88.2, %482
- %508 = fmul float %482, 5.000000e-01
- %509 = fmul float %129, %507
- %510 = fadd float %509, %22
- %511 = fmul float %130, %507
- %512 = fadd float %511, %23
- %513 = insertelement <4 x float> undef, float %510, i32 0
- %514 = insertelement <4 x float> %513, float %512, i32 1
- %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2
- %516 = insertelement <4 x float> %515, float %498, i32 3
- %517 = extractelement <4 x float> %516, i32 0
- %518 = extractelement <4 x float> %516, i32 1
- %519 = insertelement <4 x float> undef, float %517, i32 0
- %520 = insertelement <4 x float> %519, float %518, i32 1
- %521 = insertelement <4 x float> %520, float undef, i32 2
- %522 = insertelement <4 x float> %521, float undef, i32 3
- %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2)
- %524 = extractelement <4 x float> %523, i32 3
- %525 = fcmp oge float %507, %524
- %526 = sext i1 %525 to i32
- %527 = bitcast i32 %526 to float
- %528 = bitcast float %527 to i32
- %529 = icmp ne i32 %528, 0
- br i1 %529, label %IF152, label %ENDIF151
+ENDIF148: ; preds = %IF149, %ENDIF145
+ %temp88.2 = phi float [ %tmp505, %IF149 ], [ %tmp479, %ENDIF145 ]
+ %temp92.2 = phi float [ %tmp479, %IF149 ], [ %temp92.0, %ENDIF145 ]
+ %tmp506 = fadd float %temp88.2, %tmp480
+ %tmp507 = fmul float %tmp480, 5.000000e-01
+ %tmp508 = fmul float %tmp129, %tmp506
+ %tmp509 = fadd float %tmp508, %tmp22
+ %tmp510 = fmul float %tmp130, %tmp506
+ %tmp511 = fadd float %tmp510, %tmp23
+ %tmp512 = insertelement <4 x float> undef, float %tmp509, i32 0
+ %tmp513 = insertelement <4 x float> %tmp512, float %tmp511, i32 1
+ %tmp514 = insertelement <4 x float> %tmp513, float 0.000000e+00, i32 2
+ %tmp515 = insertelement <4 x float> %tmp514, float %tmp497, i32 3
+ %tmp516 = extractelement <4 x float> %tmp515, i32 0
+ %tmp517 = extractelement <4 x float> %tmp515, i32 1
+ %tmp518 = insertelement <4 x float> undef, float %tmp516, i32 0
+ %tmp519 = insertelement <4 x float> %tmp518, float %tmp517, i32 1
+ %tmp520 = insertelement <4 x float> %tmp519, float undef, i32 2
+ %tmp521 = insertelement <4 x float> %tmp520, float undef, i32 3
+ %tmp522 = shufflevector <4 x float> %tmp521, <4 x float> %tmp521, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp523 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp522, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp524 = extractelement <4 x float> %tmp523, i32 3
+ %tmp525 = fcmp oge float %tmp506, %tmp524
+ %tmp526 = sext i1 %tmp525 to i32
+ %tmp527 = bitcast i32 %tmp526 to float
+ %tmp528 = bitcast float %tmp527 to i32
+ %tmp529 = icmp ne i32 %tmp528, 0
+ br i1 %tmp529, label %IF152, label %ENDIF151
IF152: ; preds = %ENDIF148
- %530 = fmul float 2.000000e+00, %508
- %531 = fsub float -0.000000e+00, %530
- %532 = fadd float %507, %531
+ %tmp530 = fmul float 2.000000e+00, %tmp507
+ %tmp531 = fsub float -0.000000e+00, %tmp530
+ %tmp532 = fadd float %tmp506, %tmp531
br label %ENDIF151
-ENDIF151: ; preds = %ENDIF148, %IF152
- %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ]
- %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ]
- %533 = fadd float %temp88.3, %508
- %534 = fmul float %508, 5.000000e-01
- %535 = fmul float %129, %533
- %536 = fadd float %535, %22
- %537 = fmul float %130, %533
- %538 = fadd float %537, %23
- %539 = insertelement <4 x float> undef, float %536, i32 0
- %540 = insertelement <4 x float> %539, float %538, i32 1
- %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2
- %542 = insertelement <4 x float> %541, float %524, i32 3
- %543 = extractelement <4 x float> %542, i32 0
- %544 = extractelement <4 x float> %542, i32 1
- %545 = insertelement <4 x float> undef, float %543, i32 0
- %546 = insertelement <4 x float> %545, float %544, i32 1
- %547 = insertelement <4 x float> %546, float undef, i32 2
- %548 = insertelement <4 x float> %547, float undef, i32 3
- %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2)
- %550 = extractelement <4 x float> %549, i32 3
- %551 = fcmp oge float %533, %550
- %552 = sext i1 %551 to i32
- %553 = bitcast i32 %552 to float
- %554 = bitcast float %553 to i32
- %555 = icmp ne i32 %554, 0
- br i1 %555, label %IF155, label %ENDIF154
+ENDIF151: ; preds = %IF152, %ENDIF148
+ %temp88.3 = phi float [ %tmp532, %IF152 ], [ %tmp506, %ENDIF148 ]
+ %temp92.3 = phi float [ %tmp506, %IF152 ], [ %temp92.2, %ENDIF148 ]
+ %tmp533 = fadd float %temp88.3, %tmp507
+ %tmp534 = fmul float %tmp507, 5.000000e-01
+ %tmp535 = fmul float %tmp129, %tmp533
+ %tmp536 = fadd float %tmp535, %tmp22
+ %tmp537 = fmul float %tmp130, %tmp533
+ %tmp538 = fadd float %tmp537, %tmp23
+ %tmp539 = insertelement <4 x float> undef, float %tmp536, i32 0
+ %tmp540 = insertelement <4 x float> %tmp539, float %tmp538, i32 1
+ %tmp541 = insertelement <4 x float> %tmp540, float 0.000000e+00, i32 2
+ %tmp542 = insertelement <4 x float> %tmp541, float %tmp524, i32 3
+ %tmp543 = extractelement <4 x float> %tmp542, i32 0
+ %tmp544 = extractelement <4 x float> %tmp542, i32 1
+ %tmp545 = insertelement <4 x float> undef, float %tmp543, i32 0
+ %tmp546 = insertelement <4 x float> %tmp545, float %tmp544, i32 1
+ %tmp547 = insertelement <4 x float> %tmp546, float undef, i32 2
+ %tmp548 = insertelement <4 x float> %tmp547, float undef, i32 3
+ %tmp549 = shufflevector <4 x float> %tmp548, <4 x float> %tmp548, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp550 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp549, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp551 = extractelement <4 x float> %tmp550, i32 3
+ %tmp552 = fcmp oge float %tmp533, %tmp551
+ %tmp553 = sext i1 %tmp552 to i32
+ %tmp554 = bitcast i32 %tmp553 to float
+ %tmp555 = bitcast float %tmp554 to i32
+ %tmp556 = icmp ne i32 %tmp555, 0
+ br i1 %tmp556, label %IF155, label %ENDIF154
IF155: ; preds = %ENDIF151
- %556 = fmul float 2.000000e+00, %534
- %557 = fsub float -0.000000e+00, %556
- %558 = fadd float %533, %557
+ %tmp557 = fmul float 2.000000e+00, %tmp534
+ %tmp558 = fsub float -0.000000e+00, %tmp557
+ %tmp559 = fadd float %tmp533, %tmp558
br label %ENDIF154
-ENDIF154: ; preds = %ENDIF151, %IF155
- %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ]
- %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ]
- %559 = fadd float %temp88.4, %534
- %560 = fmul float %129, %559
- %561 = fadd float %560, %22
- %562 = fmul float %130, %559
- %563 = fadd float %562, %23
- %564 = insertelement <4 x float> undef, float %561, i32 0
- %565 = insertelement <4 x float> %564, float %563, i32 1
- %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2
- %567 = insertelement <4 x float> %566, float %550, i32 3
- %568 = extractelement <4 x float> %567, i32 0
- %569 = extractelement <4 x float> %567, i32 1
- %570 = insertelement <4 x float> undef, float %568, i32 0
- %571 = insertelement <4 x float> %570, float %569, i32 1
- %572 = insertelement <4 x float> %571, float undef, i32 2
- %573 = insertelement <4 x float> %572, float undef, i32 3
- %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2)
- %575 = extractelement <4 x float> %574, i32 3
- %576 = fcmp oge float %559, %575
- %577 = sext i1 %576 to i32
- %578 = bitcast i32 %577 to float
- %579 = bitcast float %578 to i32
- %580 = icmp ne i32 %579, 0
- %.temp92.4 = select i1 %580, float %559, float %temp92.4
- %581 = fmul float %129, %.temp92.4
- %582 = fadd float %581, %22
- %583 = fmul float %130, %.temp92.4
- %584 = fadd float %583, %23
- %585 = insertelement <4 x float> undef, float %582, i32 0
- %586 = insertelement <4 x float> %585, float %584, i32 1
- %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2
- %588 = insertelement <4 x float> %587, float %575, i32 3
- %589 = extractelement <4 x float> %588, i32 0
- %590 = extractelement <4 x float> %588, i32 1
- %591 = insertelement <4 x float> undef, float %589, i32 0
- %592 = insertelement <4 x float> %591, float %590, i32 1
- %593 = insertelement <4 x float> %592, float undef, i32 2
- %594 = insertelement <4 x float> %593, float undef, i32 3
- %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2)
- %596 = extractelement <4 x float> %595, i32 0
- %597 = extractelement <4 x float> %595, i32 1
- %598 = extractelement <4 x float> %595, i32 2
- %599 = fmul float %596, 2.000000e+00
- %600 = fadd float %599, -1.000000e+00
- %601 = fmul float %597, 2.000000e+00
- %602 = fadd float %601, -1.000000e+00
- %603 = fmul float %598, 2.000000e+00
- %604 = fadd float %603, -1.000000e+00
+ENDIF154: ; preds = %IF155, %ENDIF151
+ %temp88.4 = phi float [ %tmp559, %IF155 ], [ %tmp533, %ENDIF151 ]
+ %temp92.4 = phi float [ %tmp533, %IF155 ], [ %temp92.3, %ENDIF151 ]
+ %tmp560 = fadd float %temp88.4, %tmp534
+ %tmp561 = fmul float %tmp129, %tmp560
+ %tmp562 = fadd float %tmp561, %tmp22
+ %tmp563 = fmul float %tmp130, %tmp560
+ %tmp564 = fadd float %tmp563, %tmp23
+ %tmp565 = insertelement <4 x float> undef, float %tmp562, i32 0
+ %tmp566 = insertelement <4 x float> %tmp565, float %tmp564, i32 1
+ %tmp567 = insertelement <4 x float> %tmp566, float 0.000000e+00, i32 2
+ %tmp568 = insertelement <4 x float> %tmp567, float %tmp551, i32 3
+ %tmp569 = extractelement <4 x float> %tmp568, i32 0
+ %tmp570 = extractelement <4 x float> %tmp568, i32 1
+ %tmp571 = insertelement <4 x float> undef, float %tmp569, i32 0
+ %tmp572 = insertelement <4 x float> %tmp571, float %tmp570, i32 1
+ %tmp573 = insertelement <4 x float> %tmp572, float undef, i32 2
+ %tmp574 = insertelement <4 x float> %tmp573, float undef, i32 3
+ %tmp575 = shufflevector <4 x float> %tmp574, <4 x float> %tmp574, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp576 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp575, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp577 = extractelement <4 x float> %tmp576, i32 3
+ %tmp578 = fcmp oge float %tmp560, %tmp577
+ %tmp579 = sext i1 %tmp578 to i32
+ %tmp580 = bitcast i32 %tmp579 to float
+ %tmp581 = bitcast float %tmp580 to i32
+ %tmp582 = icmp ne i32 %tmp581, 0
+ %.temp92.4 = select i1 %tmp582, float %tmp560, float %temp92.4
+ %tmp583 = fmul float %tmp129, %.temp92.4
+ %tmp584 = fadd float %tmp583, %tmp22
+ %tmp585 = fmul float %tmp130, %.temp92.4
+ %tmp586 = fadd float %tmp585, %tmp23
+ %tmp587 = insertelement <4 x float> undef, float %tmp584, i32 0
+ %tmp588 = insertelement <4 x float> %tmp587, float %tmp586, i32 1
+ %tmp589 = insertelement <4 x float> %tmp588, float 0.000000e+00, i32 2
+ %tmp590 = insertelement <4 x float> %tmp589, float %tmp577, i32 3
+ %tmp591 = extractelement <4 x float> %tmp590, i32 0
+ %tmp592 = extractelement <4 x float> %tmp590, i32 1
+ %tmp593 = insertelement <4 x float> undef, float %tmp591, i32 0
+ %tmp594 = insertelement <4 x float> %tmp593, float %tmp592, i32 1
+ %tmp595 = insertelement <4 x float> %tmp594, float undef, i32 2
+ %tmp596 = insertelement <4 x float> %tmp595, float undef, i32 3
+ %tmp597 = shufflevector <4 x float> %tmp596, <4 x float> %tmp596, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp598 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp597, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp599 = extractelement <4 x float> %tmp598, i32 0
+ %tmp600 = extractelement <4 x float> %tmp598, i32 1
+ %tmp601 = extractelement <4 x float> %tmp598, i32 2
+ %tmp602 = fmul float %tmp599, 2.000000e+00
+ %tmp603 = fadd float %tmp602, -1.000000e+00
+ %tmp604 = fmul float %tmp600, 2.000000e+00
+ %tmp605 = fadd float %tmp604, -1.000000e+00
+ %tmp606 = fmul float %tmp601, 2.000000e+00
+ %tmp607 = fadd float %tmp606, -1.000000e+00
br label %ENDIF136
IF161: ; preds = %ENDIF136
- %605 = fmul float %202, 0x3FB99999A0000000
- %606 = fcmp uge float 0x3FE4CCCCC0000000, %605
- %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605
- %608 = fcmp uge float %607, 5.000000e-01
- %609 = select i1 %608, float 5.000000e-01, float %607
- %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300)
- %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301)
- %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302)
- %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303)
- %614 = insertelement <4 x float> undef, float %329, i32 0
- %615 = insertelement <4 x float> %614, float %330, i32 1
- %616 = insertelement <4 x float> %615, float %331, i32 2
- %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3
- %618 = insertelement <4 x float> undef, float %63, i32 0
- %619 = insertelement <4 x float> %618, float %65, i32 1
- %620 = insertelement <4 x float> %619, float %67, i32 2
- %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3
- %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621)
- %623 = fcmp uge float 0x3FE6666660000000, %622
- %624 = select i1 %623, float 0x3FE6666660000000, float %622
- %625 = fmul float %8, %624
- %626 = fmul float %13, %624
- %627 = fmul float %18, %624
- %628 = insertelement <4 x float> undef, float %34, i32 0
- %629 = insertelement <4 x float> %628, float %35, i32 1
- %630 = insertelement <4 x float> %629, float %36, i32 2
- %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3
- %632 = insertelement <4 x float> undef, float %63, i32 0
- %633 = insertelement <4 x float> %632, float %65, i32 1
- %634 = insertelement <4 x float> %633, float %67, i32 2
- %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3
- %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635)
- %637 = fcmp uge float 0x3FECCCCCC0000000, %636
- %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636
- %639 = fmul float %625, %638
- %640 = fmul float %626, %638
- %641 = fmul float %627, %638
+ %tmp608 = fmul float %result.i, 0x3FB99999A0000000
+ %tmp609 = fcmp uge float 0x3FE4CCCCC0000000, %tmp608
+ %tmp610 = select i1 %tmp609, float 0x3FE4CCCCC0000000, float %tmp608
+ %tmp611 = fcmp uge float %tmp610, 5.000000e-01
+ %tmp612 = select i1 %tmp611, float 5.000000e-01, float %tmp610
+ %one.sub.a.i137 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i138 = fmul float %one.sub.a.i137, %tmp304
+ %mul.i139 = fmul float %result.i172, %tmp304
+ %result.i140 = fadd float %mul.i139, %one.sub.ac.i138
+ %one.sub.a.i133 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i134 = fmul float %one.sub.a.i133, %tmp305
+ %mul.i135 = fmul float %result.i168, %tmp305
+ %result.i136 = fadd float %mul.i135, %one.sub.ac.i134
+ %one.sub.a.i129 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i130 = fmul float %one.sub.a.i129, %tmp306
+ %mul.i131 = fmul float %result.i164, %tmp306
+ %result.i132 = fadd float %mul.i131, %one.sub.ac.i130
+ %one.sub.a.i125 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i126 = fmul float %one.sub.a.i125, %tmp307
+ %mul.i127 = fmul float %result.i160, %tmp307
+ %result.i128 = fadd float %mul.i127, %one.sub.ac.i126
+ %tmp613 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp614 = insertelement <4 x float> %tmp613, float %tmp334, i32 1
+ %tmp615 = insertelement <4 x float> %tmp614, float %tmp335, i32 2
+ %tmp616 = insertelement <4 x float> %tmp615, float 0.000000e+00, i32 3
+ %tmp617 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp618 = insertelement <4 x float> %tmp617, float %tmp65, i32 1
+ %tmp619 = insertelement <4 x float> %tmp618, float %tmp67, i32 2
+ %tmp620 = insertelement <4 x float> %tmp619, float 0.000000e+00, i32 3
+ %tmp621 = call float @llvm.r600.dot4(<4 x float> %tmp616, <4 x float> %tmp620)
+ %tmp622 = fcmp uge float 0x3FE6666660000000, %tmp621
+ %tmp623 = select i1 %tmp622, float 0x3FE6666660000000, float %tmp621
+ %tmp624 = fmul float %tmp8, %tmp623
+ %tmp625 = fmul float %tmp13, %tmp623
+ %tmp626 = fmul float %tmp18, %tmp623
+ %tmp627 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp628 = insertelement <4 x float> %tmp627, float %tmp35, i32 1
+ %tmp629 = insertelement <4 x float> %tmp628, float %tmp36, i32 2
+ %tmp630 = insertelement <4 x float> %tmp629, float 0.000000e+00, i32 3
+ %tmp631 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp632 = insertelement <4 x float> %tmp631, float %tmp65, i32 1
+ %tmp633 = insertelement <4 x float> %tmp632, float %tmp67, i32 2
+ %tmp634 = insertelement <4 x float> %tmp633, float 0.000000e+00, i32 3
+ %tmp635 = call float @llvm.r600.dot4(<4 x float> %tmp630, <4 x float> %tmp634)
+ %tmp636 = fcmp uge float 0x3FECCCCCC0000000, %tmp635
+ %tmp637 = select i1 %tmp636, float 0x3FECCCCCC0000000, float %tmp635
+ %tmp638 = fmul float %tmp624, %tmp637
+ %tmp639 = fmul float %tmp625, %tmp637
+ %tmp640 = fmul float %tmp626, %tmp637
br label %ENDIF160
-ENDIF160: ; preds = %ENDIF136, %IF161
- %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ]
- %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ]
- %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ]
- %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ]
- %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ]
- %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
- %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
- %642 = fcmp olt float 2.200000e+03, %179
- %643 = sext i1 %642 to i32
- %644 = bitcast i32 %643 to float
- %645 = fcmp olt float %179, 2.300000e+03
- %646 = sext i1 %645 to i32
- %647 = bitcast i32 %646 to float
- %648 = bitcast float %644 to i32
- %649 = bitcast float %647 to i32
- %650 = and i32 %648, %649
- %651 = bitcast i32 %650 to float
- %652 = bitcast float %651 to i32
- %653 = icmp ne i32 %652, 0
- br i1 %653, label %IF164, label %ENDIF163
+ENDIF160: ; preds = %IF161, %ENDIF136
+ %temp84.0 = phi float [ %result.i140, %IF161 ], [ %tmp258, %ENDIF136 ]
+ %temp85.0 = phi float [ %result.i136, %IF161 ], [ %tmp259, %ENDIF136 ]
+ %temp86.0 = phi float [ %result.i132, %IF161 ], [ %tmp260, %ENDIF136 ]
+ %temp87.0 = phi float [ %result.i128, %IF161 ], [ %tmp261, %ENDIF136 ]
+ %temp92.6 = phi float [ %tmp638, %IF161 ], [ %tmp411, %ENDIF136 ]
+ %temp93.0 = phi float [ %tmp639, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %temp94.0 = phi float [ %tmp640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %tmp641 = fcmp olt float 2.200000e+03, %tmp181
+ %tmp642 = sext i1 %tmp641 to i32
+ %tmp643 = bitcast i32 %tmp642 to float
+ %tmp644 = fcmp olt float %tmp181, 2.300000e+03
+ %tmp645 = sext i1 %tmp644 to i32
+ %tmp646 = bitcast i32 %tmp645 to float
+ %tmp647 = bitcast float %tmp643 to i32
+ %tmp648 = bitcast float %tmp646 to i32
+ %tmp649 = and i32 %tmp647, %tmp648
+ %tmp650 = bitcast i32 %tmp649 to float
+ %tmp651 = bitcast float %tmp650 to i32
+ %tmp652 = icmp ne i32 %tmp651, 0
+ br i1 %tmp652, label %IF164, label %ENDIF163
IF164: ; preds = %ENDIF160
- %654 = fmul float %202, 5.000000e-01
- %655 = fcmp uge float 0x3FE4CCCCC0000000, %654
- %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654
- %657 = fcmp uge float %656, 0x3FD6666660000000
- %658 = select i1 %657, float 0x3FD6666660000000, float %656
- %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300)
- %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301)
- %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302)
- %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303)
- %663 = insertelement <4 x float> undef, float %329, i32 0
- %664 = insertelement <4 x float> %663, float %330, i32 1
- %665 = insertelement <4 x float> %664, float %331, i32 2
- %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3
- %667 = insertelement <4 x float> undef, float %63, i32 0
- %668 = insertelement <4 x float> %667, float %65, i32 1
- %669 = insertelement <4 x float> %668, float %67, i32 2
- %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3
- %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670)
- %672 = fcmp uge float 0x3FE6666660000000, %671
- %673 = select i1 %672, float 0x3FE6666660000000, float %671
- %674 = fmul float %8, %673
- %675 = fmul float %13, %673
- %676 = fmul float %18, %673
- %677 = insertelement <4 x float> undef, float %34, i32 0
- %678 = insertelement <4 x float> %677, float %35, i32 1
- %679 = insertelement <4 x float> %678, float %36, i32 2
- %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3
- %681 = insertelement <4 x float> undef, float %63, i32 0
- %682 = insertelement <4 x float> %681, float %65, i32 1
- %683 = insertelement <4 x float> %682, float %67, i32 2
- %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3
- %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684)
- %686 = fcmp uge float 0x3FECCCCCC0000000, %685
- %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685
- %688 = fmul float %674, %687
- %689 = fmul float %675, %687
- %690 = fmul float %676, %687
+ %tmp653 = fmul float %result.i, 5.000000e-01
+ %tmp654 = fcmp uge float 0x3FE4CCCCC0000000, %tmp653
+ %tmp655 = select i1 %tmp654, float 0x3FE4CCCCC0000000, float %tmp653
+ %tmp656 = fcmp uge float %tmp655, 0x3FD6666660000000
+ %tmp657 = select i1 %tmp656, float 0x3FD6666660000000, float %tmp655
+ %one.sub.a.i121 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i122 = fmul float %one.sub.a.i121, %tmp304
+ %mul.i123 = fmul float %result.i172, %tmp304
+ %result.i124 = fadd float %mul.i123, %one.sub.ac.i122
+ %one.sub.a.i117 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i118 = fmul float %one.sub.a.i117, %tmp305
+ %mul.i119 = fmul float %result.i168, %tmp305
+ %result.i120 = fadd float %mul.i119, %one.sub.ac.i118
+ %one.sub.a.i113 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i114 = fmul float %one.sub.a.i113, %tmp306
+ %mul.i115 = fmul float %result.i164, %tmp306
+ %result.i116 = fadd float %mul.i115, %one.sub.ac.i114
+ %one.sub.a.i109 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i110 = fmul float %one.sub.a.i109, %tmp307
+ %mul.i111 = fmul float %result.i160, %tmp307
+ %result.i112 = fadd float %mul.i111, %one.sub.ac.i110
+ %tmp658 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp659 = insertelement <4 x float> %tmp658, float %tmp334, i32 1
+ %tmp660 = insertelement <4 x float> %tmp659, float %tmp335, i32 2
+ %tmp661 = insertelement <4 x float> %tmp660, float 0.000000e+00, i32 3
+ %tmp662 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp663 = insertelement <4 x float> %tmp662, float %tmp65, i32 1
+ %tmp664 = insertelement <4 x float> %tmp663, float %tmp67, i32 2
+ %tmp665 = insertelement <4 x float> %tmp664, float 0.000000e+00, i32 3
+ %tmp666 = call float @llvm.r600.dot4(<4 x float> %tmp661, <4 x float> %tmp665)
+ %tmp667 = fcmp uge float 0x3FE6666660000000, %tmp666
+ %tmp668 = select i1 %tmp667, float 0x3FE6666660000000, float %tmp666
+ %tmp669 = fmul float %tmp8, %tmp668
+ %tmp670 = fmul float %tmp13, %tmp668
+ %tmp671 = fmul float %tmp18, %tmp668
+ %tmp672 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp673 = insertelement <4 x float> %tmp672, float %tmp35, i32 1
+ %tmp674 = insertelement <4 x float> %tmp673, float %tmp36, i32 2
+ %tmp675 = insertelement <4 x float> %tmp674, float 0.000000e+00, i32 3
+ %tmp676 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp677 = insertelement <4 x float> %tmp676, float %tmp65, i32 1
+ %tmp678 = insertelement <4 x float> %tmp677, float %tmp67, i32 2
+ %tmp679 = insertelement <4 x float> %tmp678, float 0.000000e+00, i32 3
+ %tmp680 = call float @llvm.r600.dot4(<4 x float> %tmp675, <4 x float> %tmp679)
+ %tmp681 = fcmp uge float 0x3FECCCCCC0000000, %tmp680
+ %tmp682 = select i1 %tmp681, float 0x3FECCCCCC0000000, float %tmp680
+ %tmp683 = fmul float %tmp669, %tmp682
+ %tmp684 = fmul float %tmp670, %tmp682
+ %tmp685 = fmul float %tmp671, %tmp682
br label %ENDIF163
-ENDIF163: ; preds = %ENDIF160, %IF164
- %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ]
- %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ]
- %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ]
- %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ]
- %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ]
- %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ]
- %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ]
- %691 = fcmp oge float %179, 2.300000e+03
- %692 = sext i1 %691 to i32
- %693 = bitcast i32 %692 to float
- %694 = fcmp olt float %179, 2.480000e+03
- %695 = sext i1 %694 to i32
- %696 = bitcast i32 %695 to float
- %697 = bitcast float %693 to i32
- %698 = bitcast float %696 to i32
- %699 = and i32 %697, %698
- %700 = bitcast i32 %699 to float
- %701 = bitcast float %700 to i32
- %702 = icmp ne i32 %701, 0
- br i1 %702, label %IF167, label %ENDIF166
+ENDIF163: ; preds = %IF164, %ENDIF160
+ %temp84.1 = phi float [ %result.i124, %IF164 ], [ %temp84.0, %ENDIF160 ]
+ %temp85.1 = phi float [ %result.i120, %IF164 ], [ %temp85.0, %ENDIF160 ]
+ %temp86.1 = phi float [ %result.i116, %IF164 ], [ %temp86.0, %ENDIF160 ]
+ %temp87.1 = phi float [ %result.i112, %IF164 ], [ %temp87.0, %ENDIF160 ]
+ %temp92.7 = phi float [ %tmp683, %IF164 ], [ %temp92.6, %ENDIF160 ]
+ %temp93.1 = phi float [ %tmp684, %IF164 ], [ %temp93.0, %ENDIF160 ]
+ %temp94.1 = phi float [ %tmp685, %IF164 ], [ %temp94.0, %ENDIF160 ]
+ %tmp686 = fcmp oge float %tmp181, 2.300000e+03
+ %tmp687 = sext i1 %tmp686 to i32
+ %tmp688 = bitcast i32 %tmp687 to float
+ %tmp689 = fcmp olt float %tmp181, 2.480000e+03
+ %tmp690 = sext i1 %tmp689 to i32
+ %tmp691 = bitcast i32 %tmp690 to float
+ %tmp692 = bitcast float %tmp688 to i32
+ %tmp693 = bitcast float %tmp691 to i32
+ %tmp694 = and i32 %tmp692, %tmp693
+ %tmp695 = bitcast i32 %tmp694 to float
+ %tmp696 = bitcast float %tmp695 to i32
+ %tmp697 = icmp ne i32 %tmp696, 0
+ br i1 %tmp697, label %IF167, label %ENDIF166
IF167: ; preds = %ENDIF163
- %703 = fmul float %202, 5.000000e-01
- %704 = fcmp uge float 0x3FE4CCCCC0000000, %703
- %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703
- %706 = fcmp uge float %705, 0x3FD3333340000000
- %707 = select i1 %706, float 0x3FD3333340000000, float %705
- %708 = call float @llvm.AMDGPU.lrp(float %707, float %409, float %300)
- %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301)
- %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302)
- %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303)
- %712 = insertelement <4 x float> undef, float %329, i32 0
- %713 = insertelement <4 x float> %712, float %330, i32 1
- %714 = insertelement <4 x float> %713, float %331, i32 2
- %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3
- %716 = insertelement <4 x float> undef, float %63, i32 0
- %717 = insertelement <4 x float> %716, float %65, i32 1
- %718 = insertelement <4 x float> %717, float %67, i32 2
- %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3
- %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719)
- %721 = fcmp uge float 0x3FEB333340000000, %720
- %722 = select i1 %721, float 0x3FEB333340000000, float %720
- %723 = fmul float %8, %722
- %724 = fmul float %13, %722
- %725 = fmul float %18, %722
- %726 = insertelement <4 x float> undef, float %34, i32 0
- %727 = insertelement <4 x float> %726, float %35, i32 1
- %728 = insertelement <4 x float> %727, float %36, i32 2
- %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3
- %730 = insertelement <4 x float> undef, float %63, i32 0
- %731 = insertelement <4 x float> %730, float %65, i32 1
- %732 = insertelement <4 x float> %731, float %67, i32 2
- %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3
- %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733)
- %735 = fcmp uge float 0x3FECCCCCC0000000, %734
- %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734
- %737 = fmul float %723, %736
- %738 = fmul float %724, %736
- %739 = fmul float %725, %736
+ %tmp698 = fmul float %result.i, 5.000000e-01
+ %tmp699 = fcmp uge float 0x3FE4CCCCC0000000, %tmp698
+ %tmp700 = select i1 %tmp699, float 0x3FE4CCCCC0000000, float %tmp698
+ %tmp701 = fcmp uge float %tmp700, 0x3FD3333340000000
+ %tmp702 = select i1 %tmp701, float 0x3FD3333340000000, float %tmp700
+ %one.sub.a.i105 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i106 = fmul float %one.sub.a.i105, %tmp304
+ %mul.i107 = fmul float %result.i156, %tmp304
+ %result.i108 = fadd float %mul.i107, %one.sub.ac.i106
+ %one.sub.a.i101 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i102 = fmul float %one.sub.a.i101, %tmp305
+ %mul.i103 = fmul float %result.i152, %tmp305
+ %result.i104 = fadd float %mul.i103, %one.sub.ac.i102
+ %one.sub.a.i97 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i98 = fmul float %one.sub.a.i97, %tmp306
+ %mul.i99 = fmul float %result.i148, %tmp306
+ %result.i100 = fadd float %mul.i99, %one.sub.ac.i98
+ %one.sub.a.i93 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i94 = fmul float %one.sub.a.i93, %tmp307
+ %mul.i95 = fmul float %result.i144, %tmp307
+ %result.i96 = fadd float %mul.i95, %one.sub.ac.i94
+ %tmp703 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp704 = insertelement <4 x float> %tmp703, float %tmp334, i32 1
+ %tmp705 = insertelement <4 x float> %tmp704, float %tmp335, i32 2
+ %tmp706 = insertelement <4 x float> %tmp705, float 0.000000e+00, i32 3
+ %tmp707 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp708 = insertelement <4 x float> %tmp707, float %tmp65, i32 1
+ %tmp709 = insertelement <4 x float> %tmp708, float %tmp67, i32 2
+ %tmp710 = insertelement <4 x float> %tmp709, float 0.000000e+00, i32 3
+ %tmp711 = call float @llvm.r600.dot4(<4 x float> %tmp706, <4 x float> %tmp710)
+ %tmp712 = fcmp uge float 0x3FEB333340000000, %tmp711
+ %tmp713 = select i1 %tmp712, float 0x3FEB333340000000, float %tmp711
+ %tmp714 = fmul float %tmp8, %tmp713
+ %tmp715 = fmul float %tmp13, %tmp713
+ %tmp716 = fmul float %tmp18, %tmp713
+ %tmp717 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp718 = insertelement <4 x float> %tmp717, float %tmp35, i32 1
+ %tmp719 = insertelement <4 x float> %tmp718, float %tmp36, i32 2
+ %tmp720 = insertelement <4 x float> %tmp719, float 0.000000e+00, i32 3
+ %tmp721 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp722 = insertelement <4 x float> %tmp721, float %tmp65, i32 1
+ %tmp723 = insertelement <4 x float> %tmp722, float %tmp67, i32 2
+ %tmp724 = insertelement <4 x float> %tmp723, float 0.000000e+00, i32 3
+ %tmp725 = call float @llvm.r600.dot4(<4 x float> %tmp720, <4 x float> %tmp724)
+ %tmp726 = fcmp uge float 0x3FECCCCCC0000000, %tmp725
+ %tmp727 = select i1 %tmp726, float 0x3FECCCCCC0000000, float %tmp725
+ %tmp728 = fmul float %tmp714, %tmp727
+ %tmp729 = fmul float %tmp715, %tmp727
+ %tmp730 = fmul float %tmp716, %tmp727
br label %ENDIF166
-ENDIF166: ; preds = %ENDIF163, %IF167
- %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ]
- %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ]
- %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ]
- %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ]
- %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ]
- %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ]
- %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ]
- %740 = fcmp oge float %179, 2.480000e+03
- %741 = sext i1 %740 to i32
- %742 = bitcast i32 %741 to float
- %743 = fcmp olt float %179, 2.530000e+03
- %744 = sext i1 %743 to i32
- %745 = bitcast i32 %744 to float
- %746 = bitcast float %742 to i32
- %747 = bitcast float %745 to i32
- %748 = and i32 %746, %747
- %749 = bitcast i32 %748 to float
- %750 = bitcast float %749 to i32
- %751 = icmp ne i32 %750, 0
- br i1 %751, label %IF170, label %ENDIF169
+ENDIF166: ; preds = %IF167, %ENDIF163
+ %temp84.2 = phi float [ %result.i108, %IF167 ], [ %temp84.1, %ENDIF163 ]
+ %temp85.2 = phi float [ %result.i104, %IF167 ], [ %temp85.1, %ENDIF163 ]
+ %temp86.2 = phi float [ %result.i100, %IF167 ], [ %temp86.1, %ENDIF163 ]
+ %temp87.2 = phi float [ %result.i96, %IF167 ], [ %temp87.1, %ENDIF163 ]
+ %temp92.8 = phi float [ %tmp728, %IF167 ], [ %temp92.7, %ENDIF163 ]
+ %temp93.2 = phi float [ %tmp729, %IF167 ], [ %temp93.1, %ENDIF163 ]
+ %temp94.2 = phi float [ %tmp730, %IF167 ], [ %temp94.1, %ENDIF163 ]
+ %tmp731 = fcmp oge float %tmp181, 2.480000e+03
+ %tmp732 = sext i1 %tmp731 to i32
+ %tmp733 = bitcast i32 %tmp732 to float
+ %tmp734 = fcmp olt float %tmp181, 2.530000e+03
+ %tmp735 = sext i1 %tmp734 to i32
+ %tmp736 = bitcast i32 %tmp735 to float
+ %tmp737 = bitcast float %tmp733 to i32
+ %tmp738 = bitcast float %tmp736 to i32
+ %tmp739 = and i32 %tmp737, %tmp738
+ %tmp740 = bitcast i32 %tmp739 to float
+ %tmp741 = bitcast float %tmp740 to i32
+ %tmp742 = icmp ne i32 %tmp741, 0
+ br i1 %tmp742, label %IF170, label %ENDIF169
IF170: ; preds = %ENDIF166
- %752 = fmul float %202, 5.000000e-01
- %753 = fcmp uge float 0x3FE4CCCCC0000000, %752
- %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752
- %755 = fcmp uge float %754, 0x3FC99999A0000000
- %756 = select i1 %755, float 0x3FC99999A0000000, float %754
- %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300)
- %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301)
- %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302)
- %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303)
- %761 = insertelement <4 x float> undef, float %329, i32 0
- %762 = insertelement <4 x float> %761, float %330, i32 1
- %763 = insertelement <4 x float> %762, float %331, i32 2
- %764 = insertelement <4 x float> %763, float 0.000000e+00, i32 3
- %765 = insertelement <4 x float> undef, float %63, i32 0
- %766 = insertelement <4 x float> %765, float %65, i32 1
- %767 = insertelement <4 x float> %766, float %67, i32 2
- %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3
- %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768)
- %770 = fcmp uge float 0x3FEB333340000000, %769
- %771 = select i1 %770, float 0x3FEB333340000000, float %769
- %772 = fmul float %8, %771
- %773 = fmul float %13, %771
- %774 = fmul float %18, %771
- %775 = insertelement <4 x float> undef, float %34, i32 0
- %776 = insertelement <4 x float> %775, float %35, i32 1
- %777 = insertelement <4 x float> %776, float %36, i32 2
- %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3
- %779 = insertelement <4 x float> undef, float %63, i32 0
- %780 = insertelement <4 x float> %779, float %65, i32 1
- %781 = insertelement <4 x float> %780, float %67, i32 2
- %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3
- %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782)
- %784 = fcmp uge float 0x3FECCCCCC0000000, %783
- %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783
- %786 = fmul float %772, %785
- %787 = fmul float %773, %785
- %788 = fmul float %774, %785
+ %tmp743 = fmul float %result.i, 5.000000e-01
+ %tmp744 = fcmp uge float 0x3FE4CCCCC0000000, %tmp743
+ %tmp745 = select i1 %tmp744, float 0x3FE4CCCCC0000000, float %tmp743
+ %tmp746 = fcmp uge float %tmp745, 0x3FC99999A0000000
+ %tmp747 = select i1 %tmp746, float 0x3FC99999A0000000, float %tmp745
+ %one.sub.a.i89 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i90 = fmul float %one.sub.a.i89, %tmp304
+ %mul.i91 = fmul float %result.i156, %tmp304
+ %result.i92 = fadd float %mul.i91, %one.sub.ac.i90
+ %one.sub.a.i85 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i86 = fmul float %one.sub.a.i85, %tmp305
+ %mul.i87 = fmul float %result.i152, %tmp305
+ %result.i88 = fadd float %mul.i87, %one.sub.ac.i86
+ %one.sub.a.i81 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i82 = fmul float %one.sub.a.i81, %tmp306
+ %mul.i83 = fmul float %result.i148, %tmp306
+ %result.i84 = fadd float %mul.i83, %one.sub.ac.i82
+ %one.sub.a.i77 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i78 = fmul float %one.sub.a.i77, %tmp307
+ %mul.i79 = fmul float %result.i144, %tmp307
+ %result.i80 = fadd float %mul.i79, %one.sub.ac.i78
+ %tmp748 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp749 = insertelement <4 x float> %tmp748, float %tmp334, i32 1
+ %tmp750 = insertelement <4 x float> %tmp749, float %tmp335, i32 2
+ %tmp751 = insertelement <4 x float> %tmp750, float 0.000000e+00, i32 3
+ %tmp752 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp753 = insertelement <4 x float> %tmp752, float %tmp65, i32 1
+ %tmp754 = insertelement <4 x float> %tmp753, float %tmp67, i32 2
+ %tmp755 = insertelement <4 x float> %tmp754, float 0.000000e+00, i32 3
+ %tmp756 = call float @llvm.r600.dot4(<4 x float> %tmp751, <4 x float> %tmp755)
+ %tmp757 = fcmp uge float 0x3FEB333340000000, %tmp756
+ %tmp758 = select i1 %tmp757, float 0x3FEB333340000000, float %tmp756
+ %tmp759 = fmul float %tmp8, %tmp758
+ %tmp760 = fmul float %tmp13, %tmp758
+ %tmp761 = fmul float %tmp18, %tmp758
+ %tmp762 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp763 = insertelement <4 x float> %tmp762, float %tmp35, i32 1
+ %tmp764 = insertelement <4 x float> %tmp763, float %tmp36, i32 2
+ %tmp765 = insertelement <4 x float> %tmp764, float 0.000000e+00, i32 3
+ %tmp766 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp767 = insertelement <4 x float> %tmp766, float %tmp65, i32 1
+ %tmp768 = insertelement <4 x float> %tmp767, float %tmp67, i32 2
+ %tmp769 = insertelement <4 x float> %tmp768, float 0.000000e+00, i32 3
+ %tmp770 = call float @llvm.r600.dot4(<4 x float> %tmp765, <4 x float> %tmp769)
+ %tmp771 = fcmp uge float 0x3FECCCCCC0000000, %tmp770
+ %tmp772 = select i1 %tmp771, float 0x3FECCCCCC0000000, float %tmp770
+ %tmp773 = fmul float %tmp759, %tmp772
+ %tmp774 = fmul float %tmp760, %tmp772
+ %tmp775 = fmul float %tmp761, %tmp772
br label %ENDIF169
-ENDIF169: ; preds = %ENDIF166, %IF170
- %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ]
- %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ]
- %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ]
- %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ]
- %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ]
- %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ]
- %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ]
- %789 = fcmp oge float %179, 2.530000e+03
- %790 = sext i1 %789 to i32
- %791 = bitcast i32 %790 to float
- %792 = fcmp olt float %179, 2.670000e+03
- %793 = sext i1 %792 to i32
- %794 = bitcast i32 %793 to float
- %795 = bitcast float %791 to i32
- %796 = bitcast float %794 to i32
- %797 = and i32 %795, %796
- %798 = bitcast i32 %797 to float
- %799 = bitcast float %798 to i32
- %800 = icmp ne i32 %799, 0
- br i1 %800, label %IF173, label %ENDIF172
+ENDIF169: ; preds = %IF170, %ENDIF166
+ %temp84.3 = phi float [ %result.i92, %IF170 ], [ %temp84.2, %ENDIF166 ]
+ %temp85.3 = phi float [ %result.i88, %IF170 ], [ %temp85.2, %ENDIF166 ]
+ %temp86.3 = phi float [ %result.i84, %IF170 ], [ %temp86.2, %ENDIF166 ]
+ %temp87.3 = phi float [ %result.i80, %IF170 ], [ %temp87.2, %ENDIF166 ]
+ %temp92.9 = phi float [ %tmp773, %IF170 ], [ %temp92.8, %ENDIF166 ]
+ %temp93.3 = phi float [ %tmp774, %IF170 ], [ %temp93.2, %ENDIF166 ]
+ %temp94.3 = phi float [ %tmp775, %IF170 ], [ %temp94.2, %ENDIF166 ]
+ %tmp776 = fcmp oge float %tmp181, 2.530000e+03
+ %tmp777 = sext i1 %tmp776 to i32
+ %tmp778 = bitcast i32 %tmp777 to float
+ %tmp779 = fcmp olt float %tmp181, 2.670000e+03
+ %tmp780 = sext i1 %tmp779 to i32
+ %tmp781 = bitcast i32 %tmp780 to float
+ %tmp782 = bitcast float %tmp778 to i32
+ %tmp783 = bitcast float %tmp781 to i32
+ %tmp784 = and i32 %tmp782, %tmp783
+ %tmp785 = bitcast i32 %tmp784 to float
+ %tmp786 = bitcast float %tmp785 to i32
+ %tmp787 = icmp ne i32 %tmp786, 0
+ br i1 %tmp787, label %IF173, label %ENDIF172
IF173: ; preds = %ENDIF169
- %801 = fmul float %202, 5.000000e-01
- %802 = fcmp uge float 0x3FE4CCCCC0000000, %801
- %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801
- %804 = fcmp uge float %803, 0x3FB99999A0000000
- %805 = select i1 %804, float 0x3FB99999A0000000, float %803
- %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300)
- %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301)
- %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302)
- %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303)
- %810 = insertelement <4 x float> undef, float %329, i32 0
- %811 = insertelement <4 x float> %810, float %330, i32 1
- %812 = insertelement <4 x float> %811, float %331, i32 2
- %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3
- %814 = insertelement <4 x float> undef, float %63, i32 0
- %815 = insertelement <4 x float> %814, float %65, i32 1
- %816 = insertelement <4 x float> %815, float %67, i32 2
- %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3
- %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817)
- %819 = fcmp uge float 0x3FEB333340000000, %818
- %820 = select i1 %819, float 0x3FEB333340000000, float %818
- %821 = fmul float %8, %820
- %822 = fmul float %13, %820
- %823 = fmul float %18, %820
- %824 = insertelement <4 x float> undef, float %34, i32 0
- %825 = insertelement <4 x float> %824, float %35, i32 1
- %826 = insertelement <4 x float> %825, float %36, i32 2
- %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3
- %828 = insertelement <4 x float> undef, float %63, i32 0
- %829 = insertelement <4 x float> %828, float %65, i32 1
- %830 = insertelement <4 x float> %829, float %67, i32 2
- %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3
- %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831)
- %833 = fcmp uge float 0x3FECCCCCC0000000, %832
- %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832
- %835 = fmul float %821, %834
- %836 = fmul float %822, %834
- %837 = fmul float %823, %834
+ %tmp788 = fmul float %result.i, 5.000000e-01
+ %tmp789 = fcmp uge float 0x3FE4CCCCC0000000, %tmp788
+ %tmp790 = select i1 %tmp789, float 0x3FE4CCCCC0000000, float %tmp788
+ %tmp791 = fcmp uge float %tmp790, 0x3FB99999A0000000
+ %tmp792 = select i1 %tmp791, float 0x3FB99999A0000000, float %tmp790
+ %one.sub.a.i73 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i74 = fmul float %one.sub.a.i73, %tmp304
+ %mul.i75 = fmul float %result.i172, %tmp304
+ %result.i76 = fadd float %mul.i75, %one.sub.ac.i74
+ %one.sub.a.i69 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i70 = fmul float %one.sub.a.i69, %tmp305
+ %mul.i71 = fmul float %result.i168, %tmp305
+ %result.i72 = fadd float %mul.i71, %one.sub.ac.i70
+ %one.sub.a.i65 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i66 = fmul float %one.sub.a.i65, %tmp306
+ %mul.i67 = fmul float %result.i164, %tmp306
+ %result.i68 = fadd float %mul.i67, %one.sub.ac.i66
+ %one.sub.a.i61 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i62 = fmul float %one.sub.a.i61, %tmp307
+ %mul.i63 = fmul float %result.i160, %tmp307
+ %result.i64 = fadd float %mul.i63, %one.sub.ac.i62
+ %tmp793 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp794 = insertelement <4 x float> %tmp793, float %tmp334, i32 1
+ %tmp795 = insertelement <4 x float> %tmp794, float %tmp335, i32 2
+ %tmp796 = insertelement <4 x float> %tmp795, float 0.000000e+00, i32 3
+ %tmp797 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp798 = insertelement <4 x float> %tmp797, float %tmp65, i32 1
+ %tmp799 = insertelement <4 x float> %tmp798, float %tmp67, i32 2
+ %tmp800 = insertelement <4 x float> %tmp799, float 0.000000e+00, i32 3
+ %tmp801 = call float @llvm.r600.dot4(<4 x float> %tmp796, <4 x float> %tmp800)
+ %tmp802 = fcmp uge float 0x3FEB333340000000, %tmp801
+ %tmp803 = select i1 %tmp802, float 0x3FEB333340000000, float %tmp801
+ %tmp804 = fmul float %tmp8, %tmp803
+ %tmp805 = fmul float %tmp13, %tmp803
+ %tmp806 = fmul float %tmp18, %tmp803
+ %tmp807 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp808 = insertelement <4 x float> %tmp807, float %tmp35, i32 1
+ %tmp809 = insertelement <4 x float> %tmp808, float %tmp36, i32 2
+ %tmp810 = insertelement <4 x float> %tmp809, float 0.000000e+00, i32 3
+ %tmp811 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp812 = insertelement <4 x float> %tmp811, float %tmp65, i32 1
+ %tmp813 = insertelement <4 x float> %tmp812, float %tmp67, i32 2
+ %tmp814 = insertelement <4 x float> %tmp813, float 0.000000e+00, i32 3
+ %tmp815 = call float @llvm.r600.dot4(<4 x float> %tmp810, <4 x float> %tmp814)
+ %tmp816 = fcmp uge float 0x3FECCCCCC0000000, %tmp815
+ %tmp817 = select i1 %tmp816, float 0x3FECCCCCC0000000, float %tmp815
+ %tmp818 = fmul float %tmp804, %tmp817
+ %tmp819 = fmul float %tmp805, %tmp817
+ %tmp820 = fmul float %tmp806, %tmp817
br label %ENDIF172
-ENDIF172: ; preds = %ENDIF169, %IF173
- %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ]
- %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ]
- %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ]
- %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ]
- %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ]
- %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ]
- %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ]
- %838 = fcmp oge float %179, 2.670000e+03
- %839 = sext i1 %838 to i32
- %840 = bitcast i32 %839 to float
- %841 = bitcast float %840 to i32
- %842 = icmp ne i32 %841, 0
- br i1 %842, label %IF176, label %ENDIF175
+ENDIF172: ; preds = %IF173, %ENDIF169
+ %temp84.4 = phi float [ %result.i76, %IF173 ], [ %temp84.3, %ENDIF169 ]
+ %temp85.4 = phi float [ %result.i72, %IF173 ], [ %temp85.3, %ENDIF169 ]
+ %temp86.4 = phi float [ %result.i68, %IF173 ], [ %temp86.3, %ENDIF169 ]
+ %temp87.4 = phi float [ %result.i64, %IF173 ], [ %temp87.3, %ENDIF169 ]
+ %temp92.10 = phi float [ %tmp818, %IF173 ], [ %temp92.9, %ENDIF169 ]
+ %temp93.4 = phi float [ %tmp819, %IF173 ], [ %temp93.3, %ENDIF169 ]
+ %temp94.4 = phi float [ %tmp820, %IF173 ], [ %temp94.3, %ENDIF169 ]
+ %tmp821 = fcmp oge float %tmp181, 2.670000e+03
+ %tmp822 = sext i1 %tmp821 to i32
+ %tmp823 = bitcast i32 %tmp822 to float
+ %tmp824 = bitcast float %tmp823 to i32
+ %tmp825 = icmp ne i32 %tmp824, 0
+ br i1 %tmp825, label %IF176, label %ENDIF175
IF176: ; preds = %ENDIF172
- %843 = fmul float %202, 0x3FB99999A0000000
- %844 = fcmp uge float 0.000000e+00, %843
- %845 = select i1 %844, float 0.000000e+00, float %843
- %846 = fcmp uge float %845, 0x3FD99999A0000000
- %847 = select i1 %846, float 0x3FD99999A0000000, float %845
- %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300)
- %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301)
- %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302)
- %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303)
- %852 = insertelement <4 x float> undef, float %329, i32 0
- %853 = insertelement <4 x float> %852, float %330, i32 1
- %854 = insertelement <4 x float> %853, float %331, i32 2
- %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3
- %856 = insertelement <4 x float> undef, float %63, i32 0
- %857 = insertelement <4 x float> %856, float %65, i32 1
- %858 = insertelement <4 x float> %857, float %67, i32 2
- %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3
- %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859)
- %861 = fcmp uge float 0x3FEB333340000000, %860
- %862 = select i1 %861, float 0x3FEB333340000000, float %860
- %863 = fmul float %8, %862
- %864 = fmul float %13, %862
- %865 = fmul float %18, %862
- %866 = insertelement <4 x float> undef, float %34, i32 0
- %867 = insertelement <4 x float> %866, float %35, i32 1
- %868 = insertelement <4 x float> %867, float %36, i32 2
- %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3
- %870 = insertelement <4 x float> undef, float %63, i32 0
- %871 = insertelement <4 x float> %870, float %65, i32 1
- %872 = insertelement <4 x float> %871, float %67, i32 2
- %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3
- %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873)
- %875 = fcmp uge float 0x3FECCCCCC0000000, %874
- %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874
- %877 = fmul float %863, %876
- %878 = fmul float %864, %876
- %879 = fmul float %865, %876
+ %tmp826 = fmul float %result.i, 0x3FB99999A0000000
+ %tmp827 = fcmp uge float 0.000000e+00, %tmp826
+ %tmp828 = select i1 %tmp827, float 0.000000e+00, float %tmp826
+ %tmp829 = fcmp uge float %tmp828, 0x3FD99999A0000000
+ %tmp830 = select i1 %tmp829, float 0x3FD99999A0000000, float %tmp828
+ %one.sub.a.i57 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i58 = fmul float %one.sub.a.i57, %tmp304
+ %mul.i59 = fmul float %result.i172, %tmp304
+ %result.i60 = fadd float %mul.i59, %one.sub.ac.i58
+ %one.sub.a.i53 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i54 = fmul float %one.sub.a.i53, %tmp305
+ %mul.i55 = fmul float %result.i168, %tmp305
+ %result.i56 = fadd float %mul.i55, %one.sub.ac.i54
+ %one.sub.a.i49 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i50 = fmul float %one.sub.a.i49, %tmp306
+ %mul.i51 = fmul float %result.i164, %tmp306
+ %result.i52 = fadd float %mul.i51, %one.sub.ac.i50
+ %one.sub.a.i45 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i46 = fmul float %one.sub.a.i45, %tmp307
+ %mul.i47 = fmul float %result.i160, %tmp307
+ %result.i48 = fadd float %mul.i47, %one.sub.ac.i46
+ %tmp831 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp832 = insertelement <4 x float> %tmp831, float %tmp334, i32 1
+ %tmp833 = insertelement <4 x float> %tmp832, float %tmp335, i32 2
+ %tmp834 = insertelement <4 x float> %tmp833, float 0.000000e+00, i32 3
+ %tmp835 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp836 = insertelement <4 x float> %tmp835, float %tmp65, i32 1
+ %tmp837 = insertelement <4 x float> %tmp836, float %tmp67, i32 2
+ %tmp838 = insertelement <4 x float> %tmp837, float 0.000000e+00, i32 3
+ %tmp839 = call float @llvm.r600.dot4(<4 x float> %tmp834, <4 x float> %tmp838)
+ %tmp840 = fcmp uge float 0x3FEB333340000000, %tmp839
+ %tmp841 = select i1 %tmp840, float 0x3FEB333340000000, float %tmp839
+ %tmp842 = fmul float %tmp8, %tmp841
+ %tmp843 = fmul float %tmp13, %tmp841
+ %tmp844 = fmul float %tmp18, %tmp841
+ %tmp845 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp846 = insertelement <4 x float> %tmp845, float %tmp35, i32 1
+ %tmp847 = insertelement <4 x float> %tmp846, float %tmp36, i32 2
+ %tmp848 = insertelement <4 x float> %tmp847, float 0.000000e+00, i32 3
+ %tmp849 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp850 = insertelement <4 x float> %tmp849, float %tmp65, i32 1
+ %tmp851 = insertelement <4 x float> %tmp850, float %tmp67, i32 2
+ %tmp852 = insertelement <4 x float> %tmp851, float 0.000000e+00, i32 3
+ %tmp853 = call float @llvm.r600.dot4(<4 x float> %tmp848, <4 x float> %tmp852)
+ %tmp854 = fcmp uge float 0x3FECCCCCC0000000, %tmp853
+ %tmp855 = select i1 %tmp854, float 0x3FECCCCCC0000000, float %tmp853
+ %tmp856 = fmul float %tmp842, %tmp855
+ %tmp857 = fmul float %tmp843, %tmp855
+ %tmp858 = fmul float %tmp844, %tmp855
br label %ENDIF175
-ENDIF175: ; preds = %ENDIF172, %IF176
- %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ]
- %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ]
- %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ]
- %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ]
- %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
- %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
- %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
- %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
- %881 = extractelement <4 x float> %880, i32 0
- %882 = fcmp olt float %881, %179
- %883 = sext i1 %882 to i32
- %884 = bitcast i32 %883 to float
- %885 = bitcast float %884 to i32
- %886 = icmp ne i32 %885, 0
- br i1 %886, label %IF179, label %ENDIF178
+ENDIF175: ; preds = %IF176, %ENDIF172
+ %temp84.5 = phi float [ %result.i60, %IF176 ], [ %temp84.4, %ENDIF172 ]
+ %temp85.5 = phi float [ %result.i56, %IF176 ], [ %temp85.4, %ENDIF172 ]
+ %temp86.5 = phi float [ %result.i52, %IF176 ], [ %temp86.4, %ENDIF172 ]
+ %temp87.5 = phi float [ %result.i48, %IF176 ], [ %temp87.4, %ENDIF172 ]
+ %temp92.11 = phi float [ %tmp856, %IF176 ], [ %temp92.10, %ENDIF172 ]
+ %temp93.5 = phi float [ %tmp857, %IF176 ], [ %temp93.4, %ENDIF172 ]
+ %temp94.5 = phi float [ %tmp858, %IF176 ], [ %temp94.4, %ENDIF172 ]
+ %tmp859 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %tmp860 = extractelement <4 x float> %tmp859, i32 0
+ %tmp861 = fcmp olt float %tmp860, %tmp181
+ %tmp862 = sext i1 %tmp861 to i32
+ %tmp863 = bitcast i32 %tmp862 to float
+ %tmp864 = bitcast float %tmp863 to i32
+ %tmp865 = icmp ne i32 %tmp864, 0
+ br i1 %tmp865, label %IF179, label %ENDIF178
IF179: ; preds = %ENDIF175
- %887 = fadd float %202, 1.000000e+00
- %888 = fadd float %202, 1.000000e+00
- %889 = fadd float %202, 1.000000e+00
- %890 = insertelement <4 x float> undef, float %43, i32 0
- %891 = insertelement <4 x float> %890, float %44, i32 1
- %892 = insertelement <4 x float> %891, float %45, i32 2
- %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3
- %894 = insertelement <4 x float> undef, float %43, i32 0
- %895 = insertelement <4 x float> %894, float %44, i32 1
- %896 = insertelement <4 x float> %895, float %45, i32 2
- %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
- %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
- %899 = call float @llvm.AMDGPU.rsq.f32(float %898)
- %900 = fmul float %45, %899
- %901 = call float @fabs(float %900)
- %902 = fmul float %176, 0x3FECCCCCC0000000
- %903 = fadd float %902, %901
- %904 = fadd float %903, 0xBFEFAE1480000000
- %905 = fmul float %904, 0xC043FFFE20000000
- %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00)
- %907 = fmul float 2.000000e+00, %906
- %908 = fsub float -0.000000e+00, %907
- %909 = fadd float 3.000000e+00, %908
- %910 = fmul float %906, %909
- %911 = fmul float %906, %910
- %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887)
- %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888)
- %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889)
- %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00)
- %916 = fmul float %202, 5.000000e-01
- %917 = fcmp uge float 0x3FE4CCCCC0000000, %916
- %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916
- %919 = fcmp uge float %918, 0x3FE3333340000000
- %920 = select i1 %919, float 0x3FE3333340000000, float %918
- %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5)
- %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5)
- %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5)
- %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5)
- %925 = insertelement <4 x float> undef, float %329, i32 0
- %926 = insertelement <4 x float> %925, float %330, i32 1
- %927 = insertelement <4 x float> %926, float %331, i32 2
- %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3
- %929 = insertelement <4 x float> undef, float %63, i32 0
- %930 = insertelement <4 x float> %929, float %65, i32 1
- %931 = insertelement <4 x float> %930, float %67, i32 2
- %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3
- %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932)
- %934 = fcmp uge float 0x3FE99999A0000000, %933
- %935 = select i1 %934, float 0x3FE99999A0000000, float %933
- %936 = fmul float %8, %935
- %937 = fmul float %13, %935
- %938 = fmul float %18, %935
- %939 = insertelement <4 x float> undef, float %34, i32 0
- %940 = insertelement <4 x float> %939, float %35, i32 1
- %941 = insertelement <4 x float> %940, float %36, i32 2
- %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3
- %943 = insertelement <4 x float> undef, float %63, i32 0
- %944 = insertelement <4 x float> %943, float %65, i32 1
- %945 = insertelement <4 x float> %944, float %67, i32 2
- %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3
- %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946)
- %948 = fcmp uge float 0x3FECCCCCC0000000, %947
- %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947
- %950 = fmul float %936, %949
- %951 = fmul float %937, %949
- %952 = fmul float %938, %949
+ %tmp866 = fadd float %result.i, 1.000000e+00
+ %tmp867 = fadd float %result.i, 1.000000e+00
+ %tmp868 = fadd float %result.i, 1.000000e+00
+ %tmp869 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp870 = insertelement <4 x float> %tmp869, float %tmp44, i32 1
+ %tmp871 = insertelement <4 x float> %tmp870, float %tmp45, i32 2
+ %tmp872 = insertelement <4 x float> %tmp871, float 0.000000e+00, i32 3
+ %tmp873 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp874 = insertelement <4 x float> %tmp873, float %tmp44, i32 1
+ %tmp875 = insertelement <4 x float> %tmp874, float %tmp45, i32 2
+ %tmp876 = insertelement <4 x float> %tmp875, float 0.000000e+00, i32 3
+ %tmp877 = call float @llvm.r600.dot4(<4 x float> %tmp872, <4 x float> %tmp876)
+ %tmp878 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp877)
+ %tmp879 = fmul float %tmp45, %tmp878
+ %tmp880 = call float @llvm.fabs.f32(float %tmp879)
+ %tmp881 = fmul float %tmp178, 0x3FECCCCCC0000000
+ %tmp882 = fadd float %tmp881, %tmp880
+ %tmp883 = fadd float %tmp882, 0xBFEFAE1480000000
+ %tmp884 = fmul float %tmp883, 0xC043FFFE20000000
+ %tmp885 = call float @llvm.AMDGPU.clamp.f32(float %tmp884, float 0.000000e+00, float 1.000000e+00)
+ %tmp886 = fmul float 2.000000e+00, %tmp885
+ %tmp887 = fsub float -0.000000e+00, %tmp886
+ %tmp888 = fadd float 3.000000e+00, %tmp887
+ %tmp889 = fmul float %tmp885, %tmp888
+ %tmp890 = fmul float %tmp885, %tmp889
+ %one.sub.a.i41 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i42 = fmul float %one.sub.a.i41, %tmp866
+ %mul.i43 = fmul float %temp84.5, %tmp866
+ %result.i44 = fadd float %mul.i43, %one.sub.ac.i42
+ %one.sub.a.i37 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i38 = fmul float %one.sub.a.i37, %tmp867
+ %mul.i39 = fmul float %temp85.5, %tmp867
+ %result.i40 = fadd float %mul.i39, %one.sub.ac.i38
+ %one.sub.a.i33 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i34 = fmul float %one.sub.a.i33, %tmp868
+ %mul.i35 = fmul float %temp86.5, %tmp868
+ %result.i36 = fadd float %mul.i35, %one.sub.ac.i34
+ %one.sub.a.i29 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i30 = fmul float %one.sub.a.i29, 0.000000e+00
+ %mul.i31 = fmul float %temp87.5, 0.000000e+00
+ %result.i32 = fadd float %mul.i31, %one.sub.ac.i30
+ %tmp891 = fmul float %result.i, 5.000000e-01
+ %tmp892 = fcmp uge float 0x3FE4CCCCC0000000, %tmp891
+ %tmp893 = select i1 %tmp892, float 0x3FE4CCCCC0000000, float %tmp891
+ %tmp894 = fcmp uge float %tmp893, 0x3FE3333340000000
+ %tmp895 = select i1 %tmp894, float 0x3FE3333340000000, float %tmp893
+ %one.sub.a.i25 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i26 = fmul float %one.sub.a.i25, %temp84.5
+ %mul.i27 = fmul float %result.i44, %temp84.5
+ %result.i28 = fadd float %mul.i27, %one.sub.ac.i26
+ %one.sub.a.i21 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i22 = fmul float %one.sub.a.i21, %temp85.5
+ %mul.i23 = fmul float %result.i40, %temp85.5
+ %result.i24 = fadd float %mul.i23, %one.sub.ac.i22
+ %one.sub.a.i17 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i18 = fmul float %one.sub.a.i17, %temp86.5
+ %mul.i19 = fmul float %result.i36, %temp86.5
+ %result.i20 = fadd float %mul.i19, %one.sub.ac.i18
+ %one.sub.a.i13 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i14 = fmul float %one.sub.a.i13, %temp87.5
+ %mul.i15 = fmul float %result.i32, %temp87.5
+ %result.i16 = fadd float %mul.i15, %one.sub.ac.i14
+ %tmp896 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp897 = insertelement <4 x float> %tmp896, float %tmp334, i32 1
+ %tmp898 = insertelement <4 x float> %tmp897, float %tmp335, i32 2
+ %tmp899 = insertelement <4 x float> %tmp898, float 0.000000e+00, i32 3
+ %tmp900 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp901 = insertelement <4 x float> %tmp900, float %tmp65, i32 1
+ %tmp902 = insertelement <4 x float> %tmp901, float %tmp67, i32 2
+ %tmp903 = insertelement <4 x float> %tmp902, float 0.000000e+00, i32 3
+ %tmp904 = call float @llvm.r600.dot4(<4 x float> %tmp899, <4 x float> %tmp903)
+ %tmp905 = fcmp uge float 0x3FE99999A0000000, %tmp904
+ %tmp906 = select i1 %tmp905, float 0x3FE99999A0000000, float %tmp904
+ %tmp907 = fmul float %tmp8, %tmp906
+ %tmp908 = fmul float %tmp13, %tmp906
+ %tmp909 = fmul float %tmp18, %tmp906
+ %tmp910 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp911 = insertelement <4 x float> %tmp910, float %tmp35, i32 1
+ %tmp912 = insertelement <4 x float> %tmp911, float %tmp36, i32 2
+ %tmp913 = insertelement <4 x float> %tmp912, float 0.000000e+00, i32 3
+ %tmp914 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp915 = insertelement <4 x float> %tmp914, float %tmp65, i32 1
+ %tmp916 = insertelement <4 x float> %tmp915, float %tmp67, i32 2
+ %tmp917 = insertelement <4 x float> %tmp916, float 0.000000e+00, i32 3
+ %tmp918 = call float @llvm.r600.dot4(<4 x float> %tmp913, <4 x float> %tmp917)
+ %tmp919 = fcmp uge float 0x3FECCCCCC0000000, %tmp918
+ %tmp920 = select i1 %tmp919, float 0x3FECCCCCC0000000, float %tmp918
+ %tmp921 = fmul float %tmp907, %tmp920
+ %tmp922 = fmul float %tmp908, %tmp920
+ %tmp923 = fmul float %tmp909, %tmp920
br label %ENDIF178
-ENDIF178: ; preds = %ENDIF175, %IF179
- %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ]
- %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ]
- %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ]
- %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ]
- %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ]
- %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ]
- %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ]
- %953 = fmul float %55, %temp92.12
- %954 = fmul float %57, %temp93.6
- %955 = fmul float %59, %temp94.6
- %956 = fmul float %61, 0.000000e+00
- %957 = fmul float %temp84.6, %953
- %958 = fmul float %temp85.6, %954
- %959 = fmul float %temp86.6, %955
- %960 = fmul float %temp87.6, %956
- %961 = fmul float %2, -2.000000e+00
- %962 = fadd float %961, 1.000000e+00
- %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
- %964 = extractelement <4 x float> %963, i32 2
- %965 = fsub float -0.000000e+00, %964
- %966 = fadd float %962, %965
- %967 = fdiv float 1.000000e+00, %966
- %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
- %969 = extractelement <4 x float> %968, i32 2
- %970 = fmul float %969, %967
- %971 = fsub float -0.000000e+00, %53
- %972 = fmul float %971, %53
- %973 = fmul float %972, %970
- %974 = fmul float %973, %970
- %975 = fmul float %974, 0x3FF7154760000000
- %976 = call float @llvm.AMDIL.exp.(float %975)
- %977 = fcmp oeq float %53, 1.000000e+00
- %978 = sext i1 %977 to i32
- %979 = bitcast i32 %978 to float
- %980 = bitcast float %979 to i32
- %981 = icmp ne i32 %980, 0
- %.184 = select i1 %981, float 1.000000e+00, float %976
- %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47)
- %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49)
- %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51)
- %985 = insertelement <4 x float> undef, float %982, i32 0
- %986 = insertelement <4 x float> %985, float %983, i32 1
- %987 = insertelement <4 x float> %986, float %984, i32 2
- %988 = insertelement <4 x float> %987, float %960, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0)
+ENDIF178: ; preds = %IF179, %ENDIF175
+ %temp84.6 = phi float [ %result.i28, %IF179 ], [ %temp84.5, %ENDIF175 ]
+ %temp85.6 = phi float [ %result.i24, %IF179 ], [ %temp85.5, %ENDIF175 ]
+ %temp86.6 = phi float [ %result.i20, %IF179 ], [ %temp86.5, %ENDIF175 ]
+ %temp87.6 = phi float [ %result.i16, %IF179 ], [ %temp87.5, %ENDIF175 ]
+ %temp92.12 = phi float [ %tmp921, %IF179 ], [ %temp92.11, %ENDIF175 ]
+ %temp93.6 = phi float [ %tmp922, %IF179 ], [ %temp93.5, %ENDIF175 ]
+ %temp94.6 = phi float [ %tmp923, %IF179 ], [ %temp94.5, %ENDIF175 ]
+ %tmp924 = fmul float %tmp55, %temp92.12
+ %tmp925 = fmul float %tmp57, %temp93.6
+ %tmp926 = fmul float %tmp59, %temp94.6
+ %tmp927 = fmul float %tmp61, 0.000000e+00
+ %tmp928 = fmul float %temp84.6, %tmp924
+ %tmp929 = fmul float %temp85.6, %tmp925
+ %tmp930 = fmul float %temp86.6, %tmp926
+ %tmp931 = fmul float %temp87.6, %tmp927
+ %tmp932 = fmul float %tmp2, -2.000000e+00
+ %tmp933 = fadd float %tmp932, 1.000000e+00
+ %tmp934 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+ %tmp935 = extractelement <4 x float> %tmp934, i32 2
+ %tmp936 = fsub float -0.000000e+00, %tmp935
+ %tmp937 = fadd float %tmp933, %tmp936
+ %tmp938 = fdiv float 1.000000e+00, %tmp937
+ %tmp939 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+ %tmp940 = extractelement <4 x float> %tmp939, i32 2
+ %tmp941 = fmul float %tmp940, %tmp938
+ %tmp942 = fsub float -0.000000e+00, %tmp53
+ %tmp943 = fmul float %tmp942, %tmp53
+ %tmp944 = fmul float %tmp943, %tmp941
+ %tmp945 = fmul float %tmp944, %tmp941
+ %tmp946 = fmul float %tmp945, 0x3FF7154760000000
+ %tmp947 = call float @llvm.exp2.f32(float %tmp946)
+ %tmp948 = fcmp oeq float %tmp53, 1.000000e+00
+ %tmp949 = sext i1 %tmp948 to i32
+ %tmp950 = bitcast i32 %tmp949 to float
+ %tmp951 = bitcast float %tmp950 to i32
+ %tmp952 = icmp ne i32 %tmp951, 0
+ %.184 = select i1 %tmp952, float 1.000000e+00, float %tmp947
+ %one.sub.a.i9 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i10 = fmul float %one.sub.a.i9, %tmp47
+ %mul.i11 = fmul float %tmp928, %tmp47
+ %result.i12 = fadd float %mul.i11, %one.sub.ac.i10
+ %one.sub.a.i5 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i6 = fmul float %one.sub.a.i5, %tmp49
+ %mul.i7 = fmul float %tmp929, %tmp49
+ %result.i8 = fadd float %mul.i7, %one.sub.ac.i6
+ %one.sub.a.i1 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i2 = fmul float %one.sub.a.i1, %tmp51
+ %mul.i3 = fmul float %tmp930, %tmp51
+ %result.i4 = fadd float %mul.i3, %one.sub.ac.i2
+ %tmp953 = insertelement <4 x float> undef, float %result.i12, i32 0
+ %tmp954 = insertelement <4 x float> %tmp953, float %result.i8, i32 1
+ %tmp955 = insertelement <4 x float> %tmp954, float %result.i4, i32 2
+ %tmp956 = insertelement <4 x float> %tmp955, float %tmp931, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp956, i32 0, i32 0)
ret void
}
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+; Function Attrs: nounwind readonly
+declare float @llvm.fabs.f32(float) #1
-; Function Attrs: readonly
-declare float @fabs(float) #2
+; Function Attrs: nounwind readnone
+declare float @llvm.exp2.f32(float) #0
-; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
-attributes #2 = { readonly }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/bitcast.ll b/test/CodeGen/AMDGPU/bitcast.ll
index fd56d956bf31..87ef5978ebfc 100644
--- a/test/CodeGen/AMDGPU/bitcast.ll
+++ b/test/CodeGen/AMDGPU/bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; This test just checks that the compiler doesn't crash.
@@ -7,7 +7,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
; FUNC-LABEL: {{^}}v32i8_to_v8i32:
; SI: s_endpgm
-define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
entry:
%1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
%2 = bitcast <32 x i8> %1 to <8 x i32>
@@ -76,4 +76,34 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
ret void
}
-attributes #0 = { "ShaderType"="0" }
+; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
+define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %end
+
+if:
+ %cast = bitcast <2 x i64> %value to <2 x double>
+ br label %end
+
+end:
+ %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
+ store <2 x double> %phi, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
+define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %end
+
+if:
+ %cast = bitcast <2 x double> %value to <2 x i64>
+ br label %end
+
+end:
+ %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
+ store <2 x i64> %phi, <2 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
new file mode 100644
index 000000000000..150e3430a5e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -0,0 +1,158 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that materialized constants which are the bit reverse of an
+; inline immediate are replaced with a bfrev of the inline immediate to
+; save code size.
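+;
+; For example, the sign-bit constant 0x80000000 (stored in
+; materialize_signbit_i32 below) is the bit reverse of the inline immediate 1,
+; so it can be materialized as:
+;   v_bfrev_b32_e32 v0, 1
+; (the destination register v0 here is only illustrative).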
+
+; GCN-LABEL: {{^}}materialize_0_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_0_i32(i32 addrspace(1)* %out) {
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_0_i64:
+; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_0_i64(i64 addrspace(1)* %out) {
+ store i64 0, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_neg1_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+ store i32 -1, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_neg1_i64:
+; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+ store i64 -1, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_signbit_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+ store i32 -2147483648, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_signbit_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+ store i64 -9223372036854775808, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+ store i32 268435455, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg16_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+ store i64 1152921504606846975, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+ store i32 -134217729, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg17_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+ store i64 -576460752303423489, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_64_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+ store i32 33554432, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_64_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+ store i64 144115188075855872, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_65_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+ store i32 -2113929216, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_65_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+ store i64 -9079256848778919936, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_3_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+ store i32 -1073741824, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_3_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+ store i64 -4611686018427387904, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+ store i32 508, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_1.0_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+ store i64 508, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 0ef7d5184c1f..62e7904f4382 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -11,8 +11,6 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
-declare i32 @llvm.AMDGPU.brev(i32) #1
-
; FUNC-LABEL: {{^}}s_brev_i16:
; SI: s_brev_b32
define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
@@ -103,13 +101,5 @@ define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrsp
ret void
}
-; FUNC-LABEL: {{^}}legacy_s_brev_i32:
-; SI: s_brev_b32
-define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
- %brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1
- store i32 %brev, i32 addrspace(1)* %out
- ret void
-}
-
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/branch-uniformity.ll b/test/CodeGen/AMDGPU/branch-uniformity.ll
new file mode 100644
index 000000000000..d1a1f93f0210
--- /dev/null
+++ b/test/CodeGen/AMDGPU/branch-uniformity.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck %s
+
+; The branch instruction in LOOP49 has a uniform condition, but PHI instructions
+; introduced by the structurizecfg pass previously caused a false divergence
+; which ended up in an assertion (or incorrect code) because
+; SIAnnotateControlFlow and structurizecfg had different ideas about which
+; branches are uniform.
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: ; %LOOP49
+; CHECK: v_cmp_ne_i32_e32 vcc,
+; CHECK: s_cbranch_vccnz
+; CHECK: ; %ENDIF53
+define amdgpu_vs float @main(i32 %in) {
+main_body:
+ %cmp = mul i32 %in, 2
+ br label %LOOP
+
+LOOP: ; preds = %ENDLOOP48, %main_body
+ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %ENDLOOP48 ]
+ %v.LOOP = phi i32 [ 0, %main_body ], [ %v.ENDLOOP48, %ENDLOOP48 ]
+ %tmp7 = icmp slt i32 %cmp, %counter
+ br i1 %tmp7, label %IF, label %LOOP49
+
+IF: ; preds = %LOOP
+ %r = bitcast i32 %v.LOOP to float
+ ret float %r
+
+LOOP49: ; preds = %LOOP
+ %tmp8 = icmp ne i32 %counter, 0
+ br i1 %tmp8, label %ENDLOOP48, label %ENDIF53
+
+ENDLOOP48: ; preds = %ENDIF53, %LOOP49
+ %v.ENDLOOP48 = phi i32 [ %v.LOOP, %LOOP49 ], [ %v.ENDIF53, %ENDIF53 ]
+ %counter.next = add i32 %counter, 1
+ br label %LOOP
+
+ENDIF53: ; preds = %LOOP49
+ %v.ENDIF53 = add i32 %v.LOOP, %counter
+ br label %ENDLOOP48
+}
diff --git a/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
new file mode 100644
index 000000000000..990671102757
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+target triple = "amdgcn--"
+
+; CHECK-LABEL: {{^}}main:
+;
+; Test for compilation only. This generated an invalid machine instruction
+; by trying to commute the operands of a V_CMP_EQ_i32_e32 instruction, both
+; of which were in SGPRs.
+define amdgpu_vs float @main(i32 %v) {
+main_body:
+ %d1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 960)
+ %d2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 976)
+ br i1 undef, label %ENDIF56, label %IF57
+
+IF57: ; preds = %ENDIF
+ %v.1 = mul i32 %v, 2
+ br label %ENDIF56
+
+ENDIF56: ; preds = %IF57, %ENDIF
+ %v.2 = phi i32 [ %v, %main_body ], [ %v.1, %IF57 ]
+ %d1.i = bitcast float %d1 to i32
+ %cc1 = icmp eq i32 %d1.i, 0
+ br i1 %cc1, label %ENDIF59, label %IF60
+
+IF60: ; preds = %ENDIF56
+ %v.3 = mul i32 %v.2, 2
+ br label %ENDIF59
+
+ENDIF59: ; preds = %IF60, %ENDIF56
+ %v.4 = phi i32 [ %v.2, %ENDIF56 ], [ %v.3, %IF60 ]
+ %d2.i = bitcast float %d2 to i32
+ %cc2 = icmp eq i32 %d2.i, 0
+ br i1 %cc2, label %ENDIF62, label %IF63
+
+IF63: ; preds = %ENDIF59
+ unreachable
+
+ENDIF62: ; preds = %ENDIF59
+ %r = bitcast i32 %v.4 to float
+ ret float %r
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll
index e769fd11c282..a3e986d367e1 100644
--- a/test/CodeGen/AMDGPU/call.ll
+++ b/test/CodeGen/AMDGPU/call.ll
@@ -1,8 +1,10 @@
-; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
-; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck %s
; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported call to function external_function in test_call_external
+; CHECK: in function test_call_external{{.*}}: unsupported call to function external_function
+; CHECK: in function test_call{{.*}}: unsupported call to function defined_function
+; CHECK: in function test_tail_call{{.*}}: unsupported call to function defined_function
declare i32 @external_function(i32) nounwind
@@ -31,3 +33,13 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %c = tail call i32 @defined_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/call_fs.ll b/test/CodeGen/AMDGPU/call_fs.ll
index 87bebbc49d52..a5a2d28ff716 100644
--- a/test/CodeGen/AMDGPU/call_fs.ll
+++ b/test/CodeGen/AMDGPU/call_fs.ll
@@ -10,8 +10,6 @@
; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
-define void @call_fs() #0 {
+define amdgpu_vs void @call_fs() {
ret void
}
-
-attributes #0 = { "ShaderType"="1" } ; Vertex Shader
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
new file mode 100644
index 000000000000..161c46b486eb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}stored_fi_to_lds:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
+define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+ %tmp = alloca float
+ store float 4.0, float *%tmp
+ store float* %tmp, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+
+; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
+
+; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
+define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 4.0, float* %tmp0
+ store float 4.0, float* %tmp1
+ store volatile float* %tmp0, float* addrspace(3)* %ptr
+ store volatile float* %tmp1, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Same frame index is used multiple times in the store
+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[K]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[ZERO]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+
+ ; Avoid optimizing everything out
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_self_offset:
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
+; GCN: buffer_store_dword [[K0]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
+; GCN: buffer_store_dword [[K1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+define void @stored_fi_to_self_offset() #0 {
+ %tmp0 = alloca [512 x i32]
+ %tmp1 = alloca i32*
+
+ ; Avoid optimizing everything out
+ %tmp0.cast = bitcast [512 x i32]* %tmp0 to i32*
+ store volatile i32 32, i32* %tmp0.cast
+
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp1
+
+ %bitcast = bitcast i32** %tmp1 to i32*
+ store volatile i32* %bitcast, i32** %tmp1
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+define void @stored_fi_to_fi() #0 {
+ %tmp0 = alloca i32*
+ %tmp1 = alloca i32*
+ %tmp2 = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
+ store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
+ store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
+
+ %bitcast1 = bitcast i32** %tmp1 to i32*
+ %bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8
+
+ store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
+ store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+ %tmp = alloca float
+ store float 0.0, float *%tmp
+ store float* %tmp, float* addrspace(1)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ %tmp2 = alloca float
+ store volatile float 0.0, float *%tmp0
+ store volatile float 0.0, float *%tmp1
+ store volatile float 0.0, float *%tmp2
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ store volatile float* %tmp2, float* addrspace(1)* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
+; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
+; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
+; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca [4096 x i32]
+ %tmp1 = alloca [4096 x i32]
+ %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
+ store volatile i32 0, i32* %gep0.tmp0
+ %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095
+ store volatile i32 999, i32* %gep1.tmp0
+ %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14
+ store i32* %gep0.tmp1, i32* addrspace(1)* %ptr
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
index c7b8c4037316..34e6669434f1 100644
--- a/test/CodeGen/AMDGPU/cayman-loop-bug.ll
+++ b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
@@ -8,25 +8,29 @@
; CHECK-NOT: ALU_PUSH_BEFORE
; CHECK: END_LOOP
; CHECK: END_LOOP
-define void @main (<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @main (<4 x float> inreg %reg0) {
entry:
br label %outer_loop
+
outer_loop:
%cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop]
%cond = icmp eq i32 %cnt, 16
br i1 %cond, label %outer_loop_body, label %exit
+
outer_loop_body:
%cnt_incr = add i32 %cnt, 1
br label %inner_loop
+
inner_loop:
%cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
- %cond2 = icmp eq i32 %cnt2, 16
- br i1 %cond, label %inner_loop_body, label %outer_loop
+ %n = load volatile i32, i32 addrspace(1)* undef
+ %cond2 = icmp slt i32 %cnt2, %n
+ br i1 %cond2, label %inner_loop_body, label %outer_loop
+
inner_loop_body:
%cnt2_incr = add i32 %cnt2, 1
br label %inner_loop
+
exit:
ret void
}
-
-attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
new file mode 100644
index 000000000000..759c48b3a9cf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -0,0 +1,121 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
+
+; GCN-LABEL: {{^}}test_loop:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+; GCN: s_endpgm
+define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ %cmp = icmp eq i32 %n, -1
+ br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br label %for.body
+}
+
+; GCN-LABEL: @loop_const_true
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 true, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_false:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+; XXX - Should there be an S_ENDPGM?
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 false, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_undef:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+; XXX - Should there be an s_endpgm?
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 undef, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_arg_0:
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: v_cmp_eq_i32_e32 vcc, 1,
+
+; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[LOOPBB]]
+; GCN-NEXT: ; BB#2
+; GCN-NEXT: s_endpgm
+define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 %cond, label %for.body, label %for.exit
+}
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 1c5bed3b905f..82f88a079307 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -42,7 +42,7 @@ done:
; OPT: br label
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
-; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 698494265a7d..916d667ec492 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,11 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare i32 @llvm.r600.read.tidig.x() #0
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
@@ -15,11 +13,12 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -41,14 +40,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -67,14 +67,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -93,14 +94,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -127,14 +129,15 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%add.arg = add i32 %arg, 8
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -163,14 +166,15 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^}}BB5_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%add.arg = add i32 %arg, 8
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -190,18 +194,17 @@ done:
}
; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
-; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
-; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^}}BB6_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
+define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%offset.ext = zext i32 %offset to i64
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -230,11 +233,12 @@ attributes #1 = { nounwind }
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -258,11 +262,12 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -290,11 +295,12 @@ done:
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -321,11 +327,12 @@ done:
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -351,11 +358,12 @@ done:
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -380,11 +388,12 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -417,11 +426,12 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -436,3 +446,35 @@ endif:
done:
ret void
}
+
+%struct.foo = type { [3 x float], [3 x float] }
+
+; OPT-LABEL: @sink_ds_address(
+; OPT: ptrtoint %struct.foo addrspace(3)* %ptr to i64
+
+; GCN-LABEL: {{^}}sink_ds_address:
+; GCN: s_load_dword [[SREG1:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
+define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+entry:
+ %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+ %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+ br label %bb32
+
+bb32:
+ %a = load float, float addrspace(3)* %x, align 4
+ %b = load float, float addrspace(3)* %y, align 4
+ %cmp = fcmp one float %a, %b
+ br i1 %cmp, label %bb34, label %bb33
+
+bb33:
+ unreachable
+
+bb34:
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
new file mode 100644
index 000000000000..33daf0292ae1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -0,0 +1,301 @@
+; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -codegenprepare < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; This particular case will actually be worse in terms of code size
+; from sinking the bitfield extract into both blocks.
+
+; OPT-LABEL: @sink_ubfe_i32(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i32 %arg1, 8
+; OPT-NEXT: %val0 = and i32 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = lshr i32 %arg1, 8
+; OPT-NEXT: %val1 = and i32 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+
+; GCN-LABEL: {{^}}sink_ubfe_i32:
+; GCN-NOT: lshr
+; GCN: s_cbranch_vccnz
+
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008
+; GCN: BB0_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70008
+
+; GCN: BB0_3:
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+entry:
+ %shr = lshr i32 %arg1, 8
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i32 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i32 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i32 %phi, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @sink_sbfe_i32(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = ashr i32 %arg1, 8
+; OPT-NEXT: %val0 = and i32 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = ashr i32 %arg1, 8
+; OPT-NEXT: %val1 = and i32 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_sbfe_i32:
+define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+entry:
+ %shr = ashr i32 %arg1, 8
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i32 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i32 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i32 %phi, i32 addrspace(1)* %out
+ ret void
+}
+
+
+; OPT-LABEL: @sink_ubfe_i16(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i16 %arg1, 4
+; OPT-NEXT: %val0 = and i16 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = lshr i16 %arg1, 4
+; OPT-NEXT: %val1 = and i16 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+
+; GCN-LABEL: {{^}}sink_ubfe_i16:
+; GCN-NOT: lshr
+; GCN: s_cbranch_vccnz
+
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
+; GCN: BB2_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
+
+; GCN: BB2_3:
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+entry:
+ %shr = lshr i16 %arg1, 4
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i16 %shr, 255
+ store volatile i16 0, i16 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i16 %shr, 127
+ store volatile i16 0, i16 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i16 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i16 %phi, i16 addrspace(1)* %out
+ ret void
+}
+
+; We don't really want to sink this one since it isn't reducible to a
+; 32-bit BFE on one half of the integer.
+
+; OPT-LABEL: @sink_ubfe_i64_span_midpoint(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 30
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 30
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_span_midpoint:
+; GCN: s_cbranch_vccnz BB3_2
+
+; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0xff
+
+; GCN: BB3_2:
+; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7f
+
+; GCN: BB3_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 30
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @sink_ubfe_i64_low32(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 15
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 15
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_low32:
+
+; GCN: s_cbranch_vccnz BB4_2
+
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000f
+
+; GCN: BB4_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7000f
+
+; GCN: BB4_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 15
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @sink_ubfe_i64_high32(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 35
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 35
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_high32:
+; GCN: s_cbranch_vccnz BB5_2
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80003
+
+; GCN: BB5_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70003
+
+; GCN: BB5_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 35
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
index 1a37e3c75fa3..8227d4c873ee 100644
--- a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
+++ b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -1,11 +1,22 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
+; ALL-LABEL: {{^}}test:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: .end_amd_kernel_code_t
+
+; ALL-NOT: flat_scr
+
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
+
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
define void @test(i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
new file mode 100644
index 000000000000..2c4c07c193af
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32)
+
+; Produces an error after adding an implicit def to v_cndmask_b32
+
+; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
+; GCN: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+bb0:
+ %tmp = icmp sgt i32 %arg1, 4
+ %c = icmp eq i32 %arg3, 0
+ %tmp4 = select i1 %c, float %arg, float 1.000000e+00
+ %tmp5 = fcmp ogt float %arg2, 0.000000e+00
+ %tmp6 = fcmp olt float %arg2, 1.000000e+00
+ %tmp7 = fcmp olt float %arg, %tmp4
+ %tmp8 = and i1 %tmp5, %tmp6
+ %tmp9 = and i1 %tmp8, %tmp7
+ br i1 %tmp9, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; The undef flag on the condition src must be preserved on the
+; implicit vcc use to avoid verifier errors.
+
+; GCN-LABEL: {{^}}preserve_condition_undef_flag:
+; GCN-NOT: vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+bb0:
+ %tmp = icmp sgt i32 %arg1, 4
+ %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
+ %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
+ %tmp5 = fcmp ogt float %arg2, 0.000000e+00
+ %tmp6 = fcmp olt float %arg2, 1.000000e+00
+ %tmp7 = fcmp olt float %arg, %tmp4
+ %tmp8 = and i1 %tmp5, %tmp6
+ %tmp9 = and i1 %tmp8, %tmp7
+ br i1 %tmp9, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/coalescer_distribute.ll b/test/CodeGen/AMDGPU/coalescer_distribute.ll
new file mode 100644
index 000000000000..7ca2612598c8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/coalescer_distribute.ll
@@ -0,0 +1,53 @@
+; RUN: llc -o /dev/null %s
+; This testcase produces a situation with unused value numbers in subregister
+; live ranges that get distributed by ConnectedVNInfoEqClasses.
+target triple = "amdgcn--"
+
+define spir_kernel void @hoge() {
+bb:
+ %tmp = tail call i32 @llvm.r600.read.tidig.x()
+ br i1 undef, label %bb2, label %bb23
+
+bb2:
+ br i1 undef, label %bb6, label %bb8
+
+bb6:
+ %tmp7 = or i64 undef, undef
+ br label %bb8
+
+bb8:
+ %tmp9 = phi i64 [ %tmp7, %bb6 ], [ undef, %bb2 ]
+ %tmp10 = icmp eq i32 %tmp, 0
+ br i1 %tmp10, label %bb11, label %bb23
+
+bb11:
+ br i1 undef, label %bb20, label %bb17
+
+bb17:
+ br label %bb20
+
+bb20:
+ %tmp21 = phi i64 [ undef, %bb17 ], [ %tmp9, %bb11 ]
+ %tmp22 = trunc i64 %tmp21 to i32
+ br label %bb23
+
+bb23:
+ %tmp24 = phi i32 [ %tmp22, %bb20 ], [ undef, %bb8 ], [ undef, %bb ]
+ br label %bb25
+
+bb25:
+ %tmp26 = phi i32 [ %tmp24, %bb23 ], [ undef, %bb25 ]
+ br i1 undef, label %bb25, label %bb30
+
+bb30:
+ br i1 undef, label %bb32, label %bb34
+
+bb32:
+ %tmp33 = zext i32 %tmp26 to i64
+ br label %bb34
+
+bb34:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index 31766047a358..731b47cd9ee2 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; --------------------------------------------------------------------------------
; i32 compares
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN-LABEL: {{^}}commute_eq_64_i32:
; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -22,7 +22,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i32:
; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -37,7 +37,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -50,7 +50,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_ugt_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -63,7 +63,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -76,7 +76,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -89,7 +89,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -105,7 +105,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -118,7 +118,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -131,7 +131,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -144,7 +144,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i32:
; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -157,7 +157,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i32:
; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -174,7 +174,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_eq_64_i64:
; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -187,7 +187,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i64:
; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -200,7 +200,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ugt_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -213,7 +213,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -226,7 +226,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -239,7 +239,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -255,7 +255,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -268,7 +268,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -281,7 +281,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -294,7 +294,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i64:
; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -307,7 +307,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i64:
; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -325,7 +325,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -339,7 +339,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -352,7 +352,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oge_2.0_f32:
; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -365,7 +365,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_olt_2.0_f32:
; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -378,7 +378,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ole_2.0_f32:
; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -391,7 +391,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_one_2.0_f32:
; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -404,7 +404,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ord_2.0_f32:
; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -417,7 +417,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -430,7 +430,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -443,7 +443,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uge_2.0_f32:
; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -456,7 +456,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ult_2.0_f32:
; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -469,7 +469,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ule_2.0_f32:
; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -482,7 +482,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_une_2.0_f32:
; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -495,7 +495,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uno_2.0_f32:
; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -513,7 +513,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -527,7 +527,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -540,7 +540,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_oge_2.0_f64:
; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -553,7 +553,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_olt_2.0_f64:
; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -566,7 +566,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ole_2.0_f64:
; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -579,7 +579,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_one_2.0_f64:
; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -592,7 +592,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ord_2.0_f64:
; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -605,7 +605,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -618,7 +618,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -631,7 +631,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uge_2.0_f64:
; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -644,7 +644,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ult_2.0_f64:
; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -657,7 +657,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ule_2.0_f64:
; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -670,7 +670,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_une_2.0_f64:
; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -683,7 +683,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uno_2.0_f64:
; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
diff --git a/test/CodeGen/AMDGPU/commute-shifts.ll b/test/CodeGen/AMDGPU/commute-shifts.ll
index f88cf6470c4f..862f236514ca 100644
--- a/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,30 +4,26 @@
; GCN-LABEL: {{^}}main:
; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-
-define void @main() #0 {
-main_body:
- %0 = fptosi float undef to i32
- %1 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> undef, <32 x i8> undef, i32 2)
- %2 = extractelement <4 x i32> %1, i32 0
- %3 = and i32 %0, 7
- %4 = shl i32 1, %3
- %5 = and i32 %2, %4
- %6 = icmp eq i32 %5, 0
- %.10 = select i1 %6, float 0.000000e+00, float undef
- %7 = call i32 @llvm.SI.packf16(float undef, float %.10)
- %8 = bitcast i32 %7 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %8, float undef, float %8)
+define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
+bb:
+ %tmp = fptosi float %arg0 to i32
+ %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp2.f = extractelement <4 x float> %tmp1, i32 0
+ %tmp2 = bitcast float %tmp2.f to i32
+ %tmp3 = and i32 %tmp, 7
+ %tmp4 = shl i32 1, %tmp3
+ %tmp5 = and i32 %tmp2, %tmp4
+ %tmp6 = icmp eq i32 %tmp5, 0
+ %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
+ %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
+ %tmp9 = bitcast i32 %tmp8 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
ret void
}
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare i32 @llvm.SI.packf16(float, float) #1
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
index 7fc36eabb780..bce3fe998c8a 100644
--- a/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.fma.f32(float, float, float) nounwind readnone
; FUNC-LABEL: @commute_add_imm_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -21,9 +21,9 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -36,9 +36,9 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; FUNC-LABEL: @commute_mul_imm_fneg_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fneg = fsub float -0.000000e+00, %x
@@ -52,9 +52,9 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -67,13 +67,13 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fadd float %x, %y.fabs
store float %z, float addrspace(1)* %out
@@ -84,13 +84,13 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fneg = fsub float -0.000000e+00, %y
%z = fmul float %x, %y.fneg
store float %z, float addrspace(1)* %out
@@ -101,13 +101,13 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
%z = fmul float %x, %y.fabs.fneg
@@ -120,13 +120,13 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fmul float %x.fabs, %y.fabs
@@ -138,13 +138,13 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
@@ -159,16 +159,16 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
; SI: buffer_store_dword [[RESULT]]
define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r2.fabs = call float @llvm.fabs.f32(float %r2)
diff --git a/test/CodeGen/AMDGPU/complex-folding.ll b/test/CodeGen/AMDGPU/complex-folding.ll
index a5399a71324c..acf81ba7b5dd 100644
--- a/test/CodeGen/AMDGPU/complex-folding.ll
+++ b/test/CodeGen/AMDGPU/complex-folding.ll
@@ -2,18 +2,16 @@
; CHECK: {{^}}main:
; CHECK-NOT: MOV
-define void @main(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %reg0) {
entry:
%0 = extractelement <4 x float> %reg0, i32 0
%1 = call float @fabs(float %0)
%2 = fptoui float %1 to i32
%3 = bitcast i32 %2 to float
%4 = insertelement <4 x float> undef, float %3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %4, i32 0, i32 0)
ret void
}
declare float @fabs(float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
new file mode 100644
index 000000000000..55a38e576ad1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+; GCN-LABEL: {{^}}convergent_inlineasm:
+; GCN: BB#0:
+; GCN: v_cmp_ne_i32_e64
+; GCN: BB#1:
+define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 1) #1
+ %tmp2 = icmp eq i32 %tmp, 8
+ br i1 %tmp2, label %bb3, label %bb5
+
+bb3: ; preds = %bb
+ %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
+ store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+; GCN-LABEL: {{^}}nonconvergent_inlineasm:
+; GCN: BB#1:
+; GCN: v_cmp_ne_i32_e64
+; GCN: BB1_2:
+define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 1)
+ %tmp2 = icmp eq i32 %tmp, 8
+ br i1 %tmp2, label %bb3, label %bb5
+
+bb3: ; preds = %bb
+ %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
+ store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { convergent nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 8b397566066a..00d2257f4adc 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -54,31 +54,12 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
}
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-
-; After scalarizing v4i8 loads is fixed.
-; XSI: buffer_load_dword
-; XSI: V_BFE
-; XSI: V_ADD
-; XSI: V_ADD
-; XSI: V_ADD
-; XSI: buffer_store_dword
-; XSI: buffer_store_dword
+; SI: buffer_load_dword
+; SI-DAG: v_lshrrev_b32
+; SI: v_and_b32
+; SI: v_or_b32
+; SI-DAG: buffer_store_dword
+; SI-DAG: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
@@ -90,34 +71,14 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-
-; XSI: buffer_load_dword
-; XSI: BFE
-; XSI: buffer_store_dword
-; XSI: V_ADD
-; XSI: buffer_store_dword
-; XSI-NEXT: buffer_store_dword
-
+; SI: buffer_load_dword
+; SI-DAG: v_lshrrev_b32
+; SI-DAG: v_add_i32
+; SI-DAG: v_and_b32
+; SI-DAG: v_or_b32
+; SI-DAG: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
@@ -128,21 +89,50 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
ret void
}
-; FUNC-LABEL: {{^}}test_copy_v3i8:
-; SI-NOT: bfe
-; SI-NOT: bfi
+; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
+; SI: buffer_load_dword
+; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; SI: s_endpgm
-define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
+; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
+; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; SI: s_endpgm
+define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
+ store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
+ store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
+ ret void
+}
+
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
+; SI: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index baedf47eef0d..6d2d260177e5 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -116,11 +116,11 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
-; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
@@ -136,7 +136,8 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
}
; FUNC-LABEL: {{^}}v_ctlz_i64:
-; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
@@ -145,8 +146,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
-; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index c1f84cd460cf..65e8205317b6 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -116,14 +116,14 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
-; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 0a031c5e24d1..e53ad13464e8 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -60,9 +60,9 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
-; GCN-NEXT: s_waitcnt
+; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
-; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
@@ -203,8 +203,8 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
@@ -250,8 +250,8 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index ec2971e98032..d0976b7d45b8 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
@@ -7,6 +7,9 @@ declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+declare i65 @llvm.ctpop.i65(i65) nounwind readnone
+declare i128 @llvm.ctpop.i128(i128) nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctpop_i64:
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
@@ -110,15 +113,13 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
ret void
}
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when the should be allowed.
-
; FUNC-LABEL: {{^}}ctpop_i64_in_br:
-; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
@@ -140,3 +141,51 @@ endif:
store i64 %tmp5, i64 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}s_ctpop_i128:
+; GCN: s_bcnt1_i32_b64 [[SRESULT0:s[0-9]+]],
+; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
+; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
+; GCN: s_endpgm
+define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+ %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
+ %truncctpop = trunc i128 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_i65:
+; GCN: s_and_b32
+; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
+; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
+; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
+; GCN: s_endpgm
+define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+ %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
+ %truncctpop = trunc i65 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Should not have extra add
+
+; FUNC-LABEL: {{^}}v_ctpop_i128:
+; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
+; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
+
+; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
+; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
+
+; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]]
+
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+ %val = load i128, i128 addrspace(1)* %in, align 8
+ %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
+ %truncctpop = trunc i128 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll
new file mode 100644
index 000000000000..ab99af5864e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cube.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubeid(float, float, float) #0
+declare float @llvm.amdgcn.cubesc(float, float, float) #0
+declare float @llvm.amdgcn.cubetc(float, float, float) #0
+declare float @llvm.amdgcn.cubema(float, float, float) #0
+
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+
+
+; GCN-LABEL: {{^}}cube:
+; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+ %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+ %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+ %cubema = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+
+ %vec0 = insertelement <4 x float> undef, float %cubeid, i32 0
+ %vec1 = insertelement <4 x float> %vec0, float %cubesc, i32 1
+ %vec2 = insertelement <4 x float> %vec1, float %cubetc, i32 2
+ %vec3 = insertelement <4 x float> %vec2, float %cubema, i32 3
+ store <4 x float> %vec3, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}legacy_cube:
+; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
+ %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
+ store <4 x float> %cube, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 834922c62cbd..dcd48c97434d 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}load_i8_to_f32:
@@ -15,12 +15,9 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
}
; SI-LABEL: {{^}}load_v2i8_to_v2f32:
-; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
-; SI-NOT: bfe
-; SI-NOT: lshr
-; SI-NOT: and
-; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_load_ushort [[LD:v[0-9]+]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
@@ -30,11 +27,11 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
}
; SI-LABEL: {{^}}load_v3i8_to_v3f32:
-; SI-NOT: bfe
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: v_cvt_f32_ubyte3_e32
-; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32
-; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
@@ -62,20 +59,20 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; This should not be adding instructions to shift into the correct
; position in the word for the component.
+; FIXME: Packing bytes
; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
-
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
+; SI-DAG: v_lshlrev_b32
+; SI-DAG: v_or_b32
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx4
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
@@ -83,26 +80,25 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
ret void
}
-; XXX - This should really still be able to use the v_cvt_f32_ubyte0
-; for each component, but computeKnownBits doesn't handle vectors very
-; well.
-
+; Instructions still emitted to repack bytes for add use.
; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-
-; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: buffer_load_dword
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
+; SI: buffer_load_dword
+; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32
+; SI-DAG: v_cvt_f32_ubyte3_e32
+
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
+; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
+; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
+; SI-DAG: v_add_i32
+
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dword
+
; SI: s_endpgm
define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
@@ -170,9 +166,9 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
ret void
}
-
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
+; SI-LABEL: {{^}}i8_zext_i32_to_f32:
define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%ext = zext i8 %load to i32
@@ -181,6 +177,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
ret void
}
+; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%ext = zext <4 x i8> %load to <4 x i32>
@@ -188,3 +185,58 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
}
+
+; SI-LABEL: {{^}}extract_byte0_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %and = and i32 %val, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte1_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 8
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte2_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 16
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte3_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 24
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
new file mode 100644
index 000000000000..a32c16dfac38
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Test for a bug where DAGCombiner::ReassociateOps() was creating adds
+; with offset in the first operand and base pointers in the second.
+
+; CHECK-LABEL: {{^}}store_same_base_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR:v\[[0-9]+:[0-9]+\]]], [[SADDR:s\[[0-9]+:[0-9]+\]]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+
+define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %offset = sext i32 %id to i64
+ %offset0 = add i64 %offset, 1027
+ %ptr0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset0
+ store volatile i32 3, i32 addrspace(1)* %ptr0
+ %offset1 = add i64 %offset, 1026
+ %ptr1 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset1
+ store volatile i32 2, i32 addrspace(1)* %ptr1
+ %offset2 = add i64 %offset, 1025
+ %ptr2 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset2
+ store volatile i32 1, i32 addrspace(1)* %ptr2
+ %offset3 = add i64 %offset, 1024
+ %ptr3 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset3
+ store volatile i32 0, i32 addrspace(1)* %ptr3
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
new file mode 100644
index 000000000000..49a7e722f29c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -0,0 +1,80 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
+
+; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
+; CHECK: debug_private_segment_buffer_sgpr = [[SREG:[0-9]+]]
+
+; CHECK: v_mov_b32_e32 [[WGIDX:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDX]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]]
+; CHECK: buffer_store_dword v0, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:16
+
+; CHECK: v_mov_b32_e32 [[WGIDY:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDY]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:4
+; CHECK: buffer_store_dword v1, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:20
+
+; CHECK: v_mov_b32_e32 [[WGIDZ:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDZ]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:8
+; CHECK: buffer_store_dword v2, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:24
+
+; CHECK: DebuggerWavefrontPrivateSegmentOffsetSGPR: s[[SOFF]]
+; CHECK: DebuggerPrivateSegmentBufferSGPR: s[[SREG]]
+
+; NOATTR-NOT: DebuggerWavefrontPrivateSegmentOffsetSGPR
+; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 269772)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 269772)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
new file mode 100644
index 000000000000..6638f4e25821
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -0,0 +1,71 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: test01.cl:2:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:3:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:4:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:5:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_endpgm
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 268929)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
new file mode 100644
index 000000000000..d30bb20bb03a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
@@ -0,0 +1,62 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
+; CHECK: reserved_vgpr_first = {{[0-9]+}}
+; CHECK-NEXT: reserved_vgpr_count = 4
+; CHECK: ReservedVGPRFirst: {{[0-9]+}}
+; CHECK-NEXT: ReservedVGPRCount: 4
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 268929)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
index da8e91454b98..723e3c27ad6b 100644
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -1,36 +1,62 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; FUNC-LABEL: {{^}}test_kernel:
+; GCN-LABEL: {{^}}test_default_si:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; DEFAULT: FloatMode: 192
-; DEFAULT: IeeeMode: 0
+; GCN-LABEL: {{^}}test_default_vi:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; FP64-DENORMAL: FloatMode: 192
-; FP64-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f64_denormals:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; FP32-DENORMAL: FloatMode: 48
-; FP32-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f32_denormals:
+; GCN: FloatMode: 48
+; GCN: IeeeMode: 0
+define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; BOTH-DENORMAL: FloatMode: 240
-; BOTH-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f32_f64_denormals:
+; GCN: FloatMode: 240
+; GCN: IeeeMode: 0
+define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; NO-DENORMAL: FloatMode: 0
-; NO-DENORMAL: IeeeMode: 0
-define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+; GCN-LABEL: {{^}}test_no_denormals:
+; GCN: FloatMode: 0
+; GCN: IeeeMode: 0
+define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
}
+
+attributes #0 = { nounwind "target-cpu"="tahiti" }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-features"="+fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
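
The FloatMode constants the checks above expect follow from how the denormal controls are packed into the FP_DENORM bits of the mode register. A minimal sketch of that arithmetic, assuming the usual layout (two bits for FP32 denormals starting at bit 4, two bits for FP64/FP16 denormals starting at bit 6, each pair enabling denormal inputs and outputs); the function name is illustrative only:

def float_mode(fp32_denormals, fp64_denormals):
    # Sketch only: reproduces the FloatMode values the test expects.
    mode = 0
    if fp32_denormals:
        mode |= 0b11 << 4   # allow FP32 denormal inputs and outputs
    if fp64_denormals:
        mode |= 0b11 << 6   # allow FP64/FP16 denormal inputs and outputs
    return mode

assert float_mode(False, True)  == 192  # test_default_*, test_f64_denormals
assert float_mode(True,  False) == 48   # test_f32_denormals
assert float_mode(True,  True)  == 240  # test_f32_f64_denormals
assert float_mode(False, False) == 0    # test_no_denormals
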
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
new file mode 100644
index 000000000000..f7f953c144da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -0,0 +1,428 @@
+# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
+--- |
+ define void @test0() { ret void }
+ define void @test1() { ret void }
+ define void @test2() { ret void }
+ define void @test3() { ret void }
+ define void @test4() { ret void }
+ define void @test5() { ret void }
+ define void @loop0() { ret void }
+ define void @loop1() { ret void }
+ define void @loop2() { ret void }
+...
+---
+# Combined use/def transfer check, the basics.
+# CHECK-LABEL: name: test0
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: %3 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %3:sub0
+# CHECK: S_NOP 0, implicit %3:sub1
+# CHECK: S_NOP 0, implicit undef %3:sub2
+# CHECK: %4 = COPY %3:sub0_sub1
+# CHECK: %5 = COPY undef %3:sub2_sub3
+# CHECK: S_NOP 0, implicit %4:sub0
+# CHECK: S_NOP 0, implicit %4:sub1
+# CHECK: S_NOP 0, implicit undef %5:sub0
+name: test0
+isSSA: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub3
+ S_NOP 0, implicit %3:sub0
+ S_NOP 0, implicit %3:sub1
+ S_NOP 0, implicit %3:sub2
+ %4 = COPY %3:sub0_sub1
+ %5 = COPY %3:sub2_sub3
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub1
+ S_NOP 0, implicit %5:sub0
+...
+---
+# Check defined lanes transfer; includes checking for some special cases like
+# undef operands or IMPLICIT_DEF definitions.
+# CHECK-LABEL: name: test1
+# CHECK: %0 = REG_SEQUENCE %sgpr0, {{[0-9]+}}, %sgpr0, {{[0-9]+}}
+# CHECK: %1 = INSERT_SUBREG %0, %sgpr1, {{[0-9]+}}
+# CHECK: %2 = INSERT_SUBREG %0:sub2_sub3, %sgpr42, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %1:sub0
+# CHECK: S_NOP 0, implicit undef %1:sub1
+# CHECK: S_NOP 0, implicit %1:sub2
+# CHECK: S_NOP 0, implicit %1:sub3
+# CHECK: S_NOP 0, implicit %2:sub0
+# CHECK: S_NOP 0, implicit undef %2:sub1
+
+# CHECK: %3 = IMPLICIT_DEF
+# CHECK: %4 = INSERT_SUBREG %0, undef %3, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %4:sub0
+# CHECK: S_NOP 0, implicit undef %4:sub1
+# CHECK: S_NOP 0, implicit %4:sub2
+# CHECK: S_NOP 0, implicit undef %4:sub3
+
+# CHECK: %5 = EXTRACT_SUBREG %0, {{[0-9]+}}
+# CHECK: %6 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: %7 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %5
+# CHECK: S_NOP 0, implicit %6
+# CHECK: S_NOP 0, implicit undef %7
+
+# CHECK: %8 = IMPLICIT_DEF
+# CHECK: %9 = EXTRACT_SUBREG undef %8, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %9
+
+# CHECK: %10 = EXTRACT_SUBREG undef %0, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %10
+name: test1
+isSSA: true
+registers:
+ - { id: 0, class: sreg_128 }
+ - { id: 1, class: sreg_128 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: sreg_32 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+body: |
+ bb.0:
+ %0 = REG_SEQUENCE %sgpr0, %subreg.sub0, %sgpr0, %subreg.sub2
+ %1 = INSERT_SUBREG %0, %sgpr1, %subreg.sub3
+ %2 = INSERT_SUBREG %0:sub2_sub3, %sgpr42, %subreg.sub0
+ S_NOP 0, implicit %1:sub0
+ S_NOP 0, implicit %1:sub1
+ S_NOP 0, implicit %1:sub2
+ S_NOP 0, implicit %1:sub3
+ S_NOP 0, implicit %2:sub0
+ S_NOP 0, implicit %2:sub1
+
+ %3 = IMPLICIT_DEF
+ %4 = INSERT_SUBREG %0, %3, %subreg.sub0
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub1
+ S_NOP 0, implicit %4:sub2
+ S_NOP 0, implicit %4:sub3
+
+ %5 = EXTRACT_SUBREG %0, %subreg.sub0_sub1
+ %6 = EXTRACT_SUBREG %5, %subreg.sub0
+ %7 = EXTRACT_SUBREG %5, %subreg.sub1
+ S_NOP 0, implicit %5
+ S_NOP 0, implicit %6
+ S_NOP 0, implicit %7
+
+ %8 = IMPLICIT_DEF
+ %9 = EXTRACT_SUBREG %8, %subreg.sub1
+ S_NOP 0, implicit %9
+
+ %10 = EXTRACT_SUBREG undef %0, %subreg.sub2_sub3
+ S_NOP 0, implicit %10
+...
+---
+# Check used lanes transfer; includes checking for some special cases like
+# undef operands.
+# CHECK-LABEL: name: test2
+# CHECK: S_NOP 0, implicit-def dead %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def %2
+# CHECK: %3 = REG_SEQUENCE undef %0, {{[0-9]+}}, %1, {{[0-9]+}}, %2, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %3:sub1
+# CHECK: S_NOP 0, implicit %3:sub3
+
+# CHECK: S_NOP 0, implicit-def %4
+# CHECK: S_NOP 0, implicit-def dead %5
+# CHECK: %6 = REG_SEQUENCE %4, {{[0-9]+}}, undef %5, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %6
+
+# CHECK: S_NOP 0, implicit-def dead %7
+# CHECK: S_NOP 0, implicit-def %8
+# CHECK: %9 = INSERT_SUBREG undef %7, %8, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %9:sub2
+
+# CHECK: S_NOP 0, implicit-def %10
+# CHECK: S_NOP 0, implicit-def dead %11
+# CHECK: %12 = INSERT_SUBREG %10, undef %11, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %12:sub3
+
+# CHECK: S_NOP 0, implicit-def %13
+# CHECK: S_NOP 0, implicit-def dead %14
+# CHECK: %15 = REG_SEQUENCE %13, {{[0-9]+}}, undef %14, {{[0-9]+}}
+# CHECK: %16 = EXTRACT_SUBREG %15, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %16:sub1
+
+name: test2
+isSSA: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_32 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_128 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_128 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_64 }
+ - { id: 12, class: sreg_128 }
+ - { id: 13, class: sreg_64 }
+ - { id: 14, class: sreg_64 }
+ - { id: 15, class: sreg_128 }
+ - { id: 16, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2_sub3
+ S_NOP 0, implicit %3:sub1
+ S_NOP 0, implicit %3:sub3
+
+ S_NOP 0, implicit-def %4
+ S_NOP 0, implicit-def %5
+ %6 = REG_SEQUENCE %4, %subreg.sub0, undef %5, %subreg.sub1
+ S_NOP 0, implicit %6
+
+ S_NOP 0, implicit-def %7
+ S_NOP 0, implicit-def %8
+ %9 = INSERT_SUBREG %7, %8, %subreg.sub2_sub3
+ S_NOP 0, implicit %9:sub2
+
+ S_NOP 0, implicit-def %10
+ S_NOP 0, implicit-def %11
+ %12 = INSERT_SUBREG %10, %11, %subreg.sub0_sub1
+ S_NOP 0, implicit %12:sub3
+
+ S_NOP 0, implicit-def %13
+ S_NOP 0, implicit-def %14
+ %15 = REG_SEQUENCE %13, %subreg.sub0_sub1, %14, %subreg.sub2_sub3
+ %16 = EXTRACT_SUBREG %15, %subreg.sub0_sub1
+ S_NOP 0, implicit %16:sub1
+...
+---
+# Check that copies to physregs use all lanes and copies from physregs define
+# all lanes, so we should not get a dead/undef flag here.
+# CHECK-LABEL: name: test3
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %vcc = COPY %0
+# CHECK: %1 = COPY %vcc
+# CHECK: S_NOP 0, implicit %1
+name: test3
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ %vcc = COPY %0
+
+ %1 = COPY %vcc
+ S_NOP 0, implicit %1
+...
+---
+# Check that implicit-def/kill do not count as def/uses.
+# CHECK-LABEL: name: test4
+# CHECK: S_NOP 0, implicit-def dead %0
+# CHECK: KILL undef %0
+# CHECK: %1 = IMPLICIT_DEF
+# CHECK: S_NOP 0, implicit undef %1
+name: test4
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ KILL %0
+
+ %1 = IMPLICIT_DEF
+ S_NOP 0, implicit %1
+...
+---
+# Check that unused inputs are marked as undef, even if the vreg itself is
+# used.
+# CHECK-LABEL: name: test5
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %1 = REG_SEQUENCE undef %0, {{[0-9]+}}, %0, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %1:sub1
+name: test5
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ %1 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
+ S_NOP 0, implicit %1:sub1
+...
+---
+# Check "optimistic" dataflow fixpoint in phi-loops.
+# CHECK-LABEL: name: loop0
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def dead %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: %3 = REG_SEQUENCE %0, {{[0-9]+}}, undef %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %4 = PHI %3, %bb.0, %5, %bb.1
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %4:sub0
+# CHECK: S_NOP 0, implicit undef %4:sub3
+name: loop0
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %4 = PHI %3, %bb.0, %5, %bb.1
+
+ ; let's swizzle some lanes around for fun...
+ %5 = REG_SEQUENCE %4:sub0, %subreg.sub0, %4:sub2, %subreg.sub1, %4:sub1, %subreg.sub2, %4:sub3, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub3
+...
+---
+# Check a loop that needs to be traversed multiple times to reach the fixpoint
+# for the used lanes. The example reads the sub3 lane at the end; however, with
+# each loop iteration one more lane should get marked as we cycle the sublanes
+# along. Sublanes sub0, sub1 and sub3 are rotated in the loop, so only sub2
+# should be dead.
+# CHECK-LABEL: name: loop1
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: S_NOP 0, implicit-def %3
+# CHECK: %4 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}, %3, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %5 = PHI %4, %bb.0, %6, %bb.1
+
+# CHECK: %6 = REG_SEQUENCE %5:sub1, {{[0-9]+}}, %5:sub3, {{[0-9]+}}, undef %5:sub2, {{[0-9]+}}, %5:sub0, {{[0-9]+}}
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %6:sub3
+name: loop1
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_32 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_128 }
+ - { id: 6, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def dead %2
+ S_NOP 0, implicit-def %3
+ %4 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %5 = PHI %4, %bb.0, %6, %bb.1
+
+ ; rotate lanes, but skip sub2 lane...
+ %6 = REG_SEQUENCE %5:sub1, %subreg.sub0, %5:sub3, %subreg.sub1, %5:sub2, %subreg.sub2, %5:sub0, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %6:sub3
+...
+---
+# Similar to the loop1 test, but check the fixpoint of the defined lanes.
+# Lanes are rotated between sub0, sub2 and sub3, so only sub1 should be dead/undef.
+# CHECK-LABEL: name: loop2
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %1 = REG_SEQUENCE %0, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %2 = PHI %1, %bb.0, %3, %bb.1
+
+# CHECK: %3 = REG_SEQUENCE %2:sub3, {{[0-9]+}}, undef %2:sub1, {{[0-9]+}}, %2:sub0, {{[0-9]+}}, %2:sub2, {{[0-9]+}}
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %2:sub0
+# CHECK: S_NOP 0, implicit undef %2:sub1
+# CHECK: S_NOP 0, implicit %2:sub2
+# CHECK: S_NOP 0, implicit %2:sub3
+name: loop2
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_128 }
+ - { id: 2, class: sreg_128 }
+ - { id: 3, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ %1 = REG_SEQUENCE %0, %subreg.sub0
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %2 = PHI %1, %bb.0, %3, %bb.1
+
+ ; rotate subreg lanes, skipping sub1
+ %3 = REG_SEQUENCE %2:sub3, %subreg.sub0, %2:sub1, %subreg.sub1, %2:sub0, %subreg.sub2, %2:sub2, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %2:sub0
+ S_NOP 0, implicit undef %2:sub1
+ S_NOP 0, implicit %2:sub2
+ S_NOP 0, implicit %2:sub3
+...
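
The fixpoint described in the loop1/loop2 comments can be reproduced by hand; the small sketch below (plain Python, names illustrative and not part of the test) propagates loop1's used lanes backwards through its lane rotation until the set stops growing, confirming that only sub2 stays dead:

# Illustrative sketch of loop1's used-lane fixpoint.
# %6 is built as sub0 <- %5:sub1, sub1 <- %5:sub3, sub3 <- %5:sub0 (sub2 skipped),
# and %5/%6 are tied together by the phi, so used lanes feed back each iteration.
source_of = {"sub0": "sub1", "sub1": "sub3", "sub3": "sub0"}

used = {"sub3"}                        # bb.2 only reads %6:sub3
while True:
    grown = used | {source_of[lane] for lane in used if lane in source_of}
    if grown == used:
        break
    used = grown

print(sorted(used))                    # ['sub0', 'sub1', 'sub3'] -> sub2 is dead
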
diff --git a/test/CodeGen/AMDGPU/dot4-folding.ll b/test/CodeGen/AMDGPU/dot4-folding.ll
deleted file mode 100644
index 4df7b63bf98e..000000000000
--- a/test/CodeGen/AMDGPU/dot4-folding.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; Exactly one constant vector can be folded into dot4, which means exactly
-; 4 MOV instructions
-; CHECK: {{^}}main:
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-
-define void @main(float addrspace(1)* %out) {
-main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
- %3 = insertelement <4 x float> undef, float %2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
- ret void
-}
-
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 171883e4c74b..5e1ebfde3e10 100644
--- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -7,12 +7,11 @@
; GCN-LABEL: {{^}}reschedule_global_load_lds_store:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
-; GCN: ds_write_b32
-; GCN: ds_write_b32
+; GCN: ds_write2_b32
; GCN: s_endpgm
define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
entry:
- %tid = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx = shl i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %gptr0, i32 %idx
%gep1 = getelementptr i32, i32 addrspace(1)* %gptr1, i32 %idx
@@ -25,7 +24,7 @@ for.body: ; preds = %for.body, %entry
%gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ]
%gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ]
%lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ]
- %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 1
+ %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 2
%val0 = load i32, i32 addrspace(1)* %gep0
store i32 %val0, i32 addrspace(3)* %lptr0.phi
%val1 = load i32, i32 addrspace(1)* %gep1
@@ -42,10 +41,7 @@ exit: ; preds = %for.body, %entry
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index e657991557e3..f461d6978f13 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -1,31 +1,31 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
; Function Attrs: nounwind
; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
; CHECK: BB0_1:
; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
-; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 4, [[VADDR]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
+; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], vcc, 0x84, [[VADDR]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
; CHECK: s_endpgm
define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
entry:
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%mul = shl nsw i32 %x.i, 1
br label %for.body
@@ -33,16 +33,16 @@ for.body: ; preds = %for.body, %entry
%sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
%offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
%k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
%arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02
%tmp = load float, float addrspace(3)* %arrayidx, align 4
- %add1 = add nsw i32 %offset.02, 1
+ %add1 = add nsw i32 %offset.02, 2
%arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1
%tmp1 = load float, float addrspace(3)* %arrayidx2, align 4
%add3 = add nsw i32 %offset.02, 32
%arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3
%tmp2 = load float, float addrspace(3)* %arrayidx4, align 4
- %add5 = add nsw i32 %offset.02, 33
+ %add5 = add nsw i32 %offset.02, 34
%arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5
%tmp3 = load float, float addrspace(3)* %arrayidx6, align 4
%add7 = add nsw i32 %offset.02, 64
@@ -67,4 +67,4 @@ for.end: ; preds = %for.body
attributes #0 = { nounwind readnone }
attributes #1 = { convergent nounwind }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 7d6eddb01993..16fb019ae0f3 100644
--- a/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -1,7 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-declare void @llvm.AMDGPU.barrier.local() #2
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
@lds.obj = addrspace(3) global [256 x i32] undef, align 4
@@ -12,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
define void @write_ds_sub0_offset0_global() #0 {
entry:
- %x.i = call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 0, %x.i
%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
@@ -26,7 +25,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
define void @add_x_shl_neg_to_sub_max_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 65535, %shl
@@ -41,7 +40,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 {
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 65536, %shl
@@ -60,7 +59,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
; GCN: s_endpgm
define void @add_x_shl_neg_to_sub_multi_use() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add0 = add i32 123, %shl
@@ -82,7 +81,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 {
; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
; GCN: s_endpgm
define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 123, %shl
@@ -97,7 +96,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 1019, %shl
@@ -111,7 +110,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 1020, %shl
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 5170d9c82712..6e30cff9609d 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -13,7 +13,7 @@
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f32(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -32,7 +32,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
@@ -50,7 +50,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 257
@@ -67,7 +67,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -99,7 +99,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -109,7 +109,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
%val1 = load float, float addrspace(3)* %arrayidx1, align 4
%sum.0 = fadd float %val0, %val1
- call void @llvm.AMDGPU.barrier.local() #2
+ call void @llvm.amdgcn.s.barrier() #2
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
@@ -134,7 +134,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -171,7 +171,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; SI: ds_read_b32
; SI: s_endpgm
define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
@@ -197,7 +197,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad
; SI: ds_read_b32
; SI: s_endpgm
define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
@@ -220,7 +220,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
; SI: s_endpgm
define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
%ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
%x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
@@ -244,7 +244,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -262,7 +262,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -281,7 +281,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; SI-NOT: ds_read2_b32
; SI: s_endpgm
define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 1
%add.x = add nsw i32 %x.i, 8
@@ -297,7 +297,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %
; SI-NOT: ds_read2_b32
; SI: s_endpgm
define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 2
%add.x = add nsw i32 %x.i, 8
@@ -316,7 +316,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f64(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -332,7 +332,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 {
; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
; SI: s_endpgm
define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 255
@@ -350,7 +350,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
; SI: s_endpgm
define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 257
@@ -368,7 +368,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 7
@@ -438,8 +438,8 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
- %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
- %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
+ %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
%tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
%add47 = add nsw i32 %x.i, 1
@@ -494,20 +494,20 @@ define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in)
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #2
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index d362c46bbf96..57e190e0cca0 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -8,9 +8,8 @@
; SI-LABEL: {{^}}offset_order:
-; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 0061aaf2cdbd..9d8375d64037 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -13,7 +13,7 @@
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
%out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
@@ -27,7 +27,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out)
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
@@ -44,7 +44,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
%elt0 = extractelement <4 x float> %val0, i32 0
@@ -69,7 +69,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
%val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
%elt0 = extractelement <3 x float> %val0, i32 0
@@ -85,17 +85,11 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
-; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-
-; FIXME: These moves shouldn't be necessary, it should be able to
-; store the same register if offset1 was the non-zero offset.
-
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
@@ -104,13 +98,11 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
@@ -120,17 +112,13 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
; CI: s_endpgm
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
%val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i
@@ -140,25 +128,18 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-
+; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}}
; CI: s_waitcnt lgkmcnt(0)
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
+; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI: s_endpgm
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
%val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i
@@ -173,7 +154,7 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
; CI: s_endpgm
define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -196,7 +177,7 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa
; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
%arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2
@@ -218,20 +199,11 @@ define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspa
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 4a0571ea16f2..7a8a206033ba 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -11,7 +11,7 @@
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 64
@@ -30,7 +30,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -44,13 +44,13 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(
}
; SI-LABEL: @simple_read2st64_f32_max_offset
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:255 offset1:1
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -65,12 +65,12 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
-; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -87,7 +87,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 63
@@ -103,7 +103,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -123,7 +123,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 64
@@ -142,7 +142,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -162,7 +162,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; SI: s_endpgm
define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 64
@@ -176,13 +176,13 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac
; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
; SI-LABEL: @simple_read2st64_f64_max_offset
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:127 offset1:4
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}, v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 256
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -197,12 +197,12 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -219,7 +219,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou
; SI-NOT: ds_read2st64_b64
; SI: s_endpgm
define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -240,7 +240,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; SI: s_endpgm
define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -253,16 +253,10 @@ define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, do
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index 9d3a293f3b89..45fcc01b2add 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -10,7 +10,7 @@
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -28,11 +28,11 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -47,11 +47,11 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
- %val0 = load float, float addrspace(1)* %in0.gep, align 4
- %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -66,11 +66,11 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
- %val0 = load float, float addrspace(1)* %in0.gep, align 4
- %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -87,11 +87,11 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
- %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
- %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
%val0.0 = extractelement <2 x float> %val0, i32 0
%val1.1 = extractelement <2 x float> %val1, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -108,7 +108,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
%val0 = extractelement <2 x float> %val, i32 0
@@ -127,7 +127,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
%val0 = extractelement <4 x float> %val, i32 0
@@ -147,11 +147,11 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
@@ -165,7 +165,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -179,11 +179,11 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
}
; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -209,11 +209,11 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
}
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -244,7 +244,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f
; SI: ds_write_b32
; SI: s_endpgm
define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -271,7 +271,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
@@ -289,7 +289,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
@@ -307,11 +307,11 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
- %val0 = load double, double addrspace(1)* %in.gep.0, align 8
- %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
store double %val0, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -372,8 +372,8 @@ define void @store_misaligned64_constant_large_offsets() {
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
- %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
+ %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%val = load float, float addrspace(1)* %in
%arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
store float %val, float addrspace(3)* %arrayidx44, align 4
@@ -411,7 +411,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
; CI: s_endpgm
define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
%val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
@@ -420,20 +420,17 @@ define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out,
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 5a1024ccf6d7..872e77361406 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -8,7 +8,7 @@
; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
; SI: s_endpgm
define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -26,11 +26,11 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
; SI: s_endpgm
define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%add.x.0 = add nsw i32 %x.i, 128
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
store float %val0, float addrspace(3)* %arrayidx0, align 4
@@ -47,11 +47,11 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 16320
@@ -67,11 +67,11 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
; SI: s_endpgm
define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
- %val0 = load double, double addrspace(1)* %in.gep.0, align 8
- %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
%add.x.0 = add nsw i32 %x.i, 256
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
store double %val0, double addrspace(3)* %arrayidx0, align 8
@@ -86,7 +86,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d
; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
; SI: s_endpgm
define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
@@ -98,20 +98,11 @@ define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C,
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index f4409a0984a9..580dc00f935e 100644
--- a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2,7 +2,7 @@
; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported dynamic alloca in test_dynamic_stackalloc
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
%alloca = alloca i32, i32 %n
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
index 90af6782c4b4..c62e57c6eaac 100644
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -22,9 +22,9 @@
; CONFIG-NEXT: .long 45096
; TYPICAL-NEXT: .long 0
; TONGA-NEXT: .long 576
-; CONFIG: .align 256
+; CONFIG: .p2align 8
; CONFIG: test:
-define void @test(i32 %p) #0 {
+define amdgpu_ps void @test(i32 %p) {
%i = add i32 %p, 2
%r = bitcast i32 %i to float
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
@@ -32,5 +32,3 @@ define void @test(i32 %p) #0 {
}
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index 267a323c5063..c67095438ee5 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,8 +12,9 @@
; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
; CHECK-NOT: s_or_b64 exec, exec
; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define void @test(i32 addrspace(1)* %out) {
entry:
+ %cond = call i32 @llvm.r600.read.tidig.x() #0
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %loop
@@ -32,3 +33,7 @@ done:
store i32 %inc, i32 addrspace(1)* %tmp3
ret void
}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
index 294c3a9c6782..3f27370d7037 100644
--- a/test/CodeGen/AMDGPU/extload-private.ll
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
@@ -39,7 +39,7 @@ entry:
define void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16
- %tmp1 = load i16, i16* %tmp0
+ %tmp1 = load volatile i16, i16* %tmp0
%tmp2 = zext i16 %tmp1 to i32
store i32 %tmp2, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
index 662eb7a9716b..2cb5cf0422dc 100644
--- a/test/CodeGen/AMDGPU/extload.ll
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -1,53 +1,65 @@
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: This never actually seems to become an extload
+; FUNC-LABEL: {{^}}global_anyext_load_i8:
+; GCN: buffer_load_dword v{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}
-; FUNC-LABEL: {{^}}anyext_load_i8:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-
-define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
- %load = load i32, i32 addrspace(1)* %cast, align 1
+ %load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <4 x i8>
%castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
- store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
+ store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_i16:
+; FUNC-LABEL: {{^}}global_anyext_load_i16:
+; GCN: buffer_load_dword v{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}
+
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-
-define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
- %load = load i32, i32 addrspace(1)* %cast, align 1
+ %load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <2 x i16>
%castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
- store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
+ store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_lds_i8:
+; FUNC-LABEL: {{^}}local_anyext_load_i8:
+; GCN: ds_read_b32 v{{[0-9]+}}
+; GCN: ds_write_b32 v{{[0-9]+}}
+
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
- %load = load i32, i32 addrspace(3)* %cast, align 1
+ %load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <4 x i8>
%castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
- store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
+ store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_lds_i16:
+; FUNC-LABEL: {{^}}local_anyext_load_i16:
+; GCN: ds_read_b32 v{{[0-9]+}}
+; GCN: ds_write_b32 v{{[0-9]+}}
+
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
- %load = load i32, i32 addrspace(3)* %cast, align 1
+ %load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <2 x i16>
%castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
- store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
+ store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut
ret void
}
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
new file mode 100644
index 000000000000..4edff152e66e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -0,0 +1,126 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i32 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1
+
+ %extract0 = extractelement <4 x i32> %vec3, i32 0
+ %extract1 = extractelement <4 x i32> %vec3, i32 1
+ %extract2 = extractelement <4 x i32> %vec3, i32 2
+ %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+ store volatile i32 %extract0, i32 addrspace(1)* %out2
+ store volatile i32 %extract1, i32 addrspace(1)* %out2
+ store volatile i32 %extract2, i32 addrspace(1)* %out2
+ store volatile i32 %extract3, i32 addrspace(1)* %out2
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_extract_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i32 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ %extract0 = extractelement <4 x i32> %vec3, i32 0
+ %extract1 = extractelement <4 x i32> %vec3, i32 1
+ %extract2 = extractelement <4 x i32> %vec3, i32 2
+ %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+ %op0 = add i32 %extract0, 3
+ %op1 = sub i32 %extract1, 9
+ %op2 = xor i32 %extract2, 1231412
+ %op3 = and i32 %extract3, 258233412312
+
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+
+ store volatile i32 %op0, i32 addrspace(1)* %out2
+ store volatile i32 %op1, i32 addrspace(1)* %out2
+ store volatile i32 %op2, i32 addrspace(1)* %out2
+ store volatile i32 %op3, i32 addrspace(1)* %out2
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i64 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
+ store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0
+
+ %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
+ %extract1 = extractelement <2 x i64> %bc.vec3, i32 1
+
+ store volatile i64 %extract0, i64 addrspace(1)* %out2
+ store volatile i64 %extract1, i64 addrspace(1)* %out2
+
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
new file mode 100644
index 000000000000..d0b19c825ee9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx2
+; GCN: buffer_store_dwordx2
+define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+ %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
+ %elt = extractelement <3 x double> %ld, i32 2
+ store volatile double %elt, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
+define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <3 x double> %foo, i32 %elt
+ store volatile double %dynelt, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
+define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <4 x double> %foo, i32 %elt
+ store volatile double %dynelt, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
new file mode 100644
index 000000000000..e012cb07163b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) #0 {
+ %p0 = extractelement <2 x i16> %foo, i32 0
+ %p1 = extractelement <2 x i16> %foo, i32 1
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v3i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+ %p0 = extractelement <3 x i16> %foo, i32 0
+ %p1 = extractelement <3 x i16> %foo, i32 2
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+ %p0 = extractelement <4 x i16> %foo, i32 0
+ %p1 = extractelement <4 x i16> %foo, i32 2
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_store_short
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+ %p0 = extractelement <3 x i16> %foo, i32 %idx
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p0, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_store_short
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+ %p0 = extractelement <4 x i16> %foo, i32 %idx
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p0, i16 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index e32559139623..0a51c39f026f 100644
--- a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Tests a case where the replacement of i64 stores with v2i32 stores
; broke other users of the bitcast when they already existed.
@@ -7,7 +8,7 @@
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) nounwind {
+define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
%vec = bitcast i64 %val to <2 x i32>
%elt0 = extractelement <2 x i32> %vec, i32 0
%elt1 = extractelement <2 x i32> %vec, i32 1
@@ -18,8 +19,8 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa
ret void
}
-
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind {
+; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
+define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
%p0 = extractelement <2 x i64> %foo, i32 0
%p1 = extractelement <2 x i64> %foo, i32 1
%out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -28,16 +29,34 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) no
ret void
}
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind {
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
+define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
%dynelt = extractelement <2 x i64> %foo, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind {
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
+define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
%load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
%or = or <2 x i64> %load, %arst
%dynelt = extractelement <2 x i64> %or, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
+define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <3 x i64> %foo, i32 %elt
+ store volatile i64 %dynelt, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
+define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <4 x i64> %foo, i32 %elt
+ store volatile i64 %dynelt, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
new file mode 100644
index 000000000000..9005bfa07c2b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -0,0 +1,151 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+ %p0 = extractelement <1 x i8> %foo, i32 0
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+ %p0 = extractelement <2 x i8> %foo, i32 0
+ %p1 = extractelement <2 x i8> %foo, i32 1
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v3i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+ %p0 = extractelement <3 x i8> %foo, i32 0
+ %p1 = extractelement <3 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+ %p0 = extractelement <4 x i8> %foo, i32 0
+ %p1 = extractelement <4 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v8i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+ %p0 = extractelement <8 x i8> %foo, i32 0
+ %p1 = extractelement <8 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v16i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+ %p0 = extractelement <16 x i8> %foo, i32 0
+ %p1 = extractelement <16 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v32i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+ %p0 = extractelement <32 x i8> %foo, i32 0
+ %p1 = extractelement <32 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v64i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+ %p0 = extractelement <64 x i8> %foo, i32 0
+ %p1 = extractelement <64 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+ %p0 = extractelement <3 x i8> %foo, i32 %idx
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+ %p0 = extractelement <4 x i8> %foo, i32 %idx
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
deleted file mode 100644
index c7572efc6f5b..000000000000
--- a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_short
-; SI: buffer_store_short
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
- %p0 = extractelement <2 x i16> %foo, i32 0
- %p1 = extractelement <2 x i16> %foo, i32 1
- %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
- store i16 %p1, i16 addrspace(1)* %out, align 2
- store i16 %p0, i16 addrspace(1)* %out1, align 2
- ret void
-}
-
-; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_short
-; SI: buffer_store_short
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
- %p0 = extractelement <4 x i16> %foo, i32 0
- %p1 = extractelement <4 x i16> %foo, i32 2
- %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
- store i16 %p1, i16 addrspace(1)* %out, align 2
- store i16 %p0, i16 addrspace(1)* %out1, align 2
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
new file mode 100644
index 000000000000..e160c20a03a0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure the add and load are reduced to 32 bits even with the
+; bitcast to vector.
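+; Only element 0 of the <2 x i32> bitcast is used, so only the low 32 bits of
+; the add are needed: a single dword load, v_add_i32, and dword store, as
+; checked below.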
+; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0:
+; GCN-DAG: s_load_dword [[B:s[0-9]+]]
+; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
+; GCN: buffer_store_dword [[ADD]]
+define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+ %a = load i64, i64 addrspace(1)* %in
+ %add = add i64 %a, %b
+ %val.bc = bitcast i64 %add to <2 x i32>
+ %extract = extractelement <2 x i32> %val.bc, i32 0
+ store i32 %extract, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_fp_to_vector_extract_0:
+; GCN: buffer_load_dwordx2
+; GCN: v_add_f64
+; GCN: buffer_store_dword v
+define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+ %a = load double, double addrspace(1)* %in
+ %add = fadd double %a, %b
+ %val.bc = bitcast double %add to <2 x i32>
+ %extract = extractelement <2 x i32> %val.bc, i32 0
+ store i32 %extract, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
+; GCN: buffer_load_dwordx2
+; GCN: v_add_i32
+; GCN: buffer_store_dword
+define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+ %a = load i64, i64 addrspace(1)* %in
+ %add = add i64 %a, %b
+ %val.bc = bitcast i64 %add to <2 x float>
+ %extract = extractelement <2 x float> %val.bc, i32 0
+ store float %extract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %elt0 = extractelement <4 x i32> %vec, i32 0
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_extract2:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+
+define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %elt2 = extractelement <4 x i32> %vec, i32 2
+ store i32 %elt2, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %eltN = extractelement <4 x i32> %vec, i32 %idx
+ store i32 %eltN, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
index 3c6136c1a7bd..db8093047a36 100644
--- a/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare double @fabs(double) readnone
declare double @llvm.fabs.f64(double) readnone
@@ -11,7 +11,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
; SI: v_and_b32
; SI: s_endpgm
define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tidext = sext i32 %tid to i64
%gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
%val = load double, double addrspace(1)* %gep, align 8
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
index 5fac328c5981..11436794ac98 100644
--- a/test/CodeGen/AMDGPU/fadd.ll
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}fadd_f32:
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
new file mode 100644
index 000000000000..981d88dfe94e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -0,0 +1,351 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+
+; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
+; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+ %val = load float, float addrspace(1)* %out
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
+; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+ %val = load double, double addrspace(1)* %out
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index c8ef5b101c4d..fb5853b808e4 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@@ -12,11 +12,11 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}fceil_f64:
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_not_b64
+; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
@@ -25,8 +25,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; SI-DAG: cndmask_b32
; SI-DAG: v_cmp_lt_f64
; SI-DAG: v_cmp_lg_f64
-; SI: s_and_b64
-; SI: v_cndmask_b32
+; SI-DAG: v_cndmask_b32
; SI: v_cndmask_b32
; SI: v_add_f64
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 3d8c55993089..738a35fb3b89 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,8 +13,8 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
; GCN: s_endpgm
define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
index 7c022e38c808..3343b681b9fe 100644
--- a/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -4,8 +4,8 @@
; COMMON-LABEL: {{^}}fdiv_f64:
-; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0
-; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
@@ -31,8 +31,8 @@
; COMMON: s_endpgm
define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
%gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
- %num = load double, double addrspace(1)* %in
- %den = load double, double addrspace(1)* %gep.1
+ %num = load volatile double, double addrspace(1)* %in
+ %den = load volatile double, double addrspace(1)* %gep.1
%result = fdiv double %num, %den
store double %result, double addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 7cbf87336399..4021233e7785 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -1,19 +1,32 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.
+; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.
+
; FUNC-LABEL: {{^}}fdiv_f32:
-; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754-DAG: v_div_scale_f32
+; I754-DAG: v_rcp_f32
+; I754-DAG: v_fma_f32
+; I754-DAG: v_mul_f32
+; I754-DAG: v_fma_f32
+; I754-DAG: v_div_fixup_f32
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fdiv float %a, %b
@@ -21,7 +34,37 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fdiv fast float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fdiv arcp float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
@@ -29,10 +72,22 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fdiv <2 x float> %a, %b
@@ -40,6 +95,50 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv fast <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv arcp <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -50,6 +149,15 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
@@ -58,6 +166,19 @@ entry:
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -66,3 +187,75 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fdiv fast <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fdiv arcp <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
index e7160ef5d726..5cb0c616d15f 100644
--- a/test/CodeGen/AMDGPU/fetch-limits.r600.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
@@ -7,42 +7,50 @@
; CHECK: Fetch clause
; CHECK: Fetch clause
-define void @fetch_limits_r600() #0 {
+define amdgpu_ps void @fetch_limits_r600() {
entry:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
- %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
- %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %a = fadd <4 x float> %res0, %res1
- %b = fadd <4 x float> %res2, %res3
- %c = fadd <4 x float> %res4, %res5
- %d = fadd <4 x float> %res6, %res7
- %e = fadd <4 x float> %res8, %a
-
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %tmp7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp9 = shufflevector <4 x float> %tmp, <4 x float> %tmp, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp10 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp11 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp12 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp13 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp14 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp15 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp16 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp17 = shufflevector <4 x float> %tmp4, <4 x float> %tmp4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp18 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp19 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp20 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp21 = shufflevector <4 x float> %tmp6, <4 x float> %tmp6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp22 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp23 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp24 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp25 = shufflevector <4 x float> %tmp8, <4 x float> %tmp8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp26 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %a = fadd <4 x float> %tmp10, %tmp12
+ %b = fadd <4 x float> %tmp14, %tmp16
+ %c = fadd <4 x float> %tmp18, %tmp20
+ %d = fadd <4 x float> %tmp22, %tmp24
+ %e = fadd <4 x float> %tmp26, %a
%bc = fadd <4 x float> %b, %c
%de = fadd <4 x float> %d, %e
-
%bcde = fadd <4 x float> %bc, %de
-
- call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
ret void
}
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
index acaea2aa7943..d8f7c0daa8de 100644
--- a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
@@ -16,7 +16,7 @@
; CHECK: Fetch clause
; CHECK: Fetch clause
-define void @fetch_limits_r700() #0 {
+define amdgpu_ps void @fetch_limits_r700() {
entry:
%0 = load <4 x float>, <4 x float> addrspace(8)* null
%1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
@@ -35,47 +35,63 @@ entry:
%14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
%15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
%16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1)
- %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1)
- %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1)
- %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1)
- %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1)
- %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1)
- %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1)
- %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1)
- %a = fadd <4 x float> %res0, %res1
- %b = fadd <4 x float> %res2, %res3
- %c = fadd <4 x float> %res4, %res5
- %d = fadd <4 x float> %res6, %res7
- %e = fadd <4 x float> %res8, %res9
- %f = fadd <4 x float> %res10, %res11
- %g = fadd <4 x float> %res12, %res13
- %h = fadd <4 x float> %res14, %res15
- %i = fadd <4 x float> %res16, %a
-
+ %17 = shufflevector <4 x float> %0, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %18 = call <4 x float> @llvm.r600.tex(<4 x float> %17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %19 = shufflevector <4 x float> %1, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %20 = call <4 x float> @llvm.r600.tex(<4 x float> %19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %21 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %22 = call <4 x float> @llvm.r600.tex(<4 x float> %21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %23 = shufflevector <4 x float> %3, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %24 = call <4 x float> @llvm.r600.tex(<4 x float> %23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %25 = shufflevector <4 x float> %4, <4 x float> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %26 = call <4 x float> @llvm.r600.tex(<4 x float> %25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %27 = shufflevector <4 x float> %5, <4 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %28 = call <4 x float> @llvm.r600.tex(<4 x float> %27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %29 = shufflevector <4 x float> %6, <4 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %30 = call <4 x float> @llvm.r600.tex(<4 x float> %29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %31 = shufflevector <4 x float> %7, <4 x float> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %32 = call <4 x float> @llvm.r600.tex(<4 x float> %31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %33 = shufflevector <4 x float> %8, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %34 = call <4 x float> @llvm.r600.tex(<4 x float> %33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %35 = shufflevector <4 x float> %9, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %36 = call <4 x float> @llvm.r600.tex(<4 x float> %35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %37 = shufflevector <4 x float> %10, <4 x float> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %38 = call <4 x float> @llvm.r600.tex(<4 x float> %37, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %39 = shufflevector <4 x float> %11, <4 x float> %11, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %40 = call <4 x float> @llvm.r600.tex(<4 x float> %39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %41 = shufflevector <4 x float> %12, <4 x float> %12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %42 = call <4 x float> @llvm.r600.tex(<4 x float> %41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %43 = shufflevector <4 x float> %13, <4 x float> %13, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %44 = call <4 x float> @llvm.r600.tex(<4 x float> %43, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %45 = shufflevector <4 x float> %14, <4 x float> %14, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %46 = call <4 x float> @llvm.r600.tex(<4 x float> %45, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %47 = shufflevector <4 x float> %15, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %48 = call <4 x float> @llvm.r600.tex(<4 x float> %47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %49 = shufflevector <4 x float> %16, <4 x float> %16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %50 = call <4 x float> @llvm.r600.tex(<4 x float> %49, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %a = fadd <4 x float> %18, %20
+ %b = fadd <4 x float> %22, %24
+ %c = fadd <4 x float> %26, %28
+ %d = fadd <4 x float> %30, %32
+ %e = fadd <4 x float> %34, %36
+ %f = fadd <4 x float> %38, %40
+ %g = fadd <4 x float> %42, %44
+ %h = fadd <4 x float> %46, %48
+ %i = fadd <4 x float> %50, %a
%bc = fadd <4 x float> %b, %c
%de = fadd <4 x float> %d, %e
%fg = fadd <4 x float> %f, %g
%hi = fadd <4 x float> %h, %i
-
%bcde = fadd <4 x float> %bc, %de
%fghi = fadd <4 x float> %fg, %hi
-
%bcdefghi = fadd <4 x float> %bcde, %fghi
- call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
ret void
}
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
index 45f8382c3929..ea708a2b7bbd 100644
--- a/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}ffloor_f64:
; CI: v_floor_f64_e32
; SI: v_fract_f64_e32
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64
@@ -28,8 +28,8 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
@@ -44,8 +44,8 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
@@ -67,15 +67,16 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
ret void
}
-; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
-; store <3 x double> %y, <3 x double> addrspace(1)* %out
-; ret void
-; }
+; FUNC-LABEL: {{^}}ffloor_v3f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI-NOT: v_floor_f64_e32
+define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+ %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+ store <3 x double> %y, <3 x double> addrspace(1)* %out
+ ret void
+}
; FUNC-LABEL: {{^}}ffloor_v4f64:
; CI: v_floor_f64_e32
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 86e0c07323bb..5ca57fd3d350 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -17,7 +17,7 @@
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
-; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr, align 4
@@ -127,9 +127,6 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
-declare i32 @llvm.r600.read.tidig.x() #3
-
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll
new file mode 100644
index 000000000000..7400dbcf8909
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -0,0 +1,968 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}atomic_add_i32_offset:
+; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_addr64:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_offset:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
+; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
+; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret:
+; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_addr64:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
+; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_offset:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_offset:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_addr64:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_offset:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_offset:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_addr64:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_offset:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
+define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_offset:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_addr64:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32:
+; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; CMP_SWAP
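+; cmpxchg returns a { i32, i1 } pair; the _ret variants below extract
+; field 0 (the loaded value) with extractvalue before storing it to %out2.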
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_offset:
+; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32:
+; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
+; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
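+; ATOMIC LOAD/STORE
+;
+; The seq_cst atomic loads and stores below are expected to select
+; flat_load_dword / flat_store_dword with the glc bit set.
+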
+; GCN-LABEL: {{^}}atomic_load_i32_offset:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
+ %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+ %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32_addr64:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
+ %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_offset:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+entry:
+ store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_addr64:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
new file mode 100644
index 000000000000..0bd6c2dd5b86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -0,0 +1,975 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
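+; The 64-bit flat atomics below are expected to select the _x2 instruction
+; forms; the returning (_ret) variants add glc and store the result with
+; flat_store_dwordx2.
+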
+; GCN-LABEL: {{^}}atomic_add_i64_offset:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_offset:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_offset:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_offset:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_offset:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_offset:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_offset:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_offset:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_offset:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_offset:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
+ %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+entry:
+ %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
+ %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_offset:
+; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
+define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+entry:
+ store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/floor.ll b/test/CodeGen/AMDGPU/floor.ll
index c6bfb8567a0f..43e58b942220 100644
--- a/test/CodeGen/AMDGPU/floor.ll
+++ b/test/CodeGen/AMDGPU/floor.ll
@@ -1,15 +1,14 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @floor(float %r0)
%vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @floor(float) readonly
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 6f3437048ed8..19deefe4d4a5 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
@@ -14,15 +14,15 @@ declare float @llvm.fma.f32(float, float, float) #0
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fadd double %mul, %c
@@ -42,7 +42,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -50,16 +50,16 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fadd double %mul, %c
%fma1 = fadd double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -71,15 +71,15 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fadd double %c, %mul
@@ -95,15 +95,15 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fsub double %mul, %c
@@ -123,7 +123,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -131,16 +131,16 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fsub double %mul, %c
%fma1 = fsub double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -152,15 +152,15 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fsub double %c, %mul
@@ -180,7 +180,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -188,16 +188,16 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fsub double %c, %mul
%fma1 = fsub double %d, %mul
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -209,15 +209,15 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
@@ -238,7 +238,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -246,18 +246,18 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
%fma0 = fsub double %mul.neg, %c
%fma1 = fsub double %mul.neg, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -272,7 +272,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -280,18 +280,18 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
%fma0 = fsub double %mul.neg, %c
%fma1 = fsub double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -307,7 +307,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -315,11 +315,11 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
%gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
+ %x = load volatile double, double addrspace(1)* %gep.0
+ %y = load volatile double, double addrspace(1)* %gep.1
+ %z = load volatile double, double addrspace(1)* %gep.2
+ %u = load volatile double, double addrspace(1)* %gep.3
+ %v = load volatile double, double addrspace(1)* %gep.4
%tmp0 = fmul double %u, %v
%tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
@@ -342,7 +342,7 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -350,11 +350,11 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
%gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
+ %x = load volatile double, double addrspace(1)* %gep.0
+ %y = load volatile double, double addrspace(1)* %gep.1
+ %z = load volatile double, double addrspace(1)* %gep.2
+ %u = load volatile double, double addrspace(1)* %gep.3
+ %v = load volatile double, double addrspace(1)* %gep.4
%tmp0 = fmul double %u, %v
%tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
@@ -373,8 +373,8 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %x = load float, float addrspace(1)* %in1
- %y = load float, float addrspace(1)* %in2
+ %x = load volatile float, float addrspace(1)* %in1
+ %y = load volatile float, float addrspace(1)* %in2
%a = fadd float %x, 1.0
%m = fmul float %a, %y
store float %m, float addrspace(1)* %out
@@ -386,8 +386,8 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %x = load float, float addrspace(1)* %in1
- %y = load float, float addrspace(1)* %in2
+ %x = load volatile float, float addrspace(1)* %in1
+ %y = load volatile float, float addrspace(1)* %in2
%a = fadd float %x, 1.0
%m = fmul float %y, %a
store float %m, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index d6024aa0b4c5..d04a5946b98c 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -61,7 +61,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)*
}
; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
-; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmad.ll b/test/CodeGen/AMDGPU/fmad.ll
index 935e35123f45..9c39bee753be 100644
--- a/test/CodeGen/AMDGPU/fmad.ll
+++ b/test/CodeGen/AMDGPU/fmad.ll
@@ -2,18 +2,16 @@
;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = extractelement <4 x float> %reg0, i32 2
%r3 = fmul float %r0, %r1
%r4 = fadd float %r3, %r2
%vec = insertelement <4 x float> undef, float %r4, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @fabs(float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmax.ll b/test/CodeGen/AMDGPU/fmax.ll
index d7127f485c74..763040522718 100644
--- a/test/CodeGen/AMDGPU/fmax.ll
+++ b/test/CodeGen/AMDGPU/fmax.ll
@@ -2,16 +2,14 @@
;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp oge float %r0, %r1
%r3 = select i1 %r2, float %r0, float %r1
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index f78c71b28264..9bbfe1e95c5b 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -4,9 +4,9 @@
declare double @llvm.maxnum.f64(double, double) nounwind readnone
; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
; SI: buffer_store_dwordx2 [[RESULT]],
@@ -14,9 +14,9 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
%cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
- %a = load double, double addrspace(1)* %aptr, align 8
- %b = load double, double addrspace(1)* %bptr, align 8
- %c = load double, double addrspace(1)* %cptr, align 8
+ %a = load volatile double, double addrspace(1)* %aptr, align 8
+ %b = load volatile double, double addrspace(1)* %bptr, align 8
+ %c = load volatile double, double addrspace(1)* %cptr, align 8
%f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone
%f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone
store double %f1, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index c3028a6217d5..c0fde6e97f6f 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -11,9 +11,9 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
@@ -29,9 +29,9 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 828243888ac7..da498caa6b54 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Make sure we don't try to form FMAX_LEGACY nodes with f64
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmax_legacy_uge_f64
define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -20,7 +20,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_oge_f64
define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -35,7 +35,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_ugt_f64
define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -50,7 +50,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_ogt_f64
define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index d374fb67350c..4a4c92a38a35 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: Should replace unsafe-fp-math with no signed zeros.
@@ -18,8 +18,8 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp uge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -38,8 +38,8 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp oge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -58,8 +58,8 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ugt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -78,8 +78,8 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ogt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -142,8 +142,8 @@ define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 ad
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ogt float %a, %b
%val = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 3029bd02e4db..a2b33a794d99 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.maxnum.f32(float, float) #0
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
@@ -207,7 +207,7 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
@@ -221,7 +221,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
new file mode 100644
index 000000000000..e66678069130
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float 2.0, float %a)
+ %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 4.0)
+ %med = call float @llvm.minnum.f32(float %max, float 2.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store volatile float %med, float addrspace(1)* %outgep
+ store volatile float %max, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
+; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
+; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
+define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+
+ %max = call double @llvm.maxnum.f64(double %a, double 2.0)
+ %med = call double @llvm.minnum.f64(double %max, double 4.0)
+
+ store double %med, double addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
+; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ ; fmax_legacy
+ %cmp0 = fcmp ule float %a, 2.0
+ %max = select i1 %cmp0, float 2.0, float %a
+
+ ; fmin_legacy
+ %cmp1 = fcmp uge float %max, 4.0
+ %med = select i1 %cmp1, float 4.0, float %max
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin.ll b/test/CodeGen/AMDGPU/fmin.ll
index defa8c09638a..d044a7a0542c 100644
--- a/test/CodeGen/AMDGPU/fmin.ll
+++ b/test/CodeGen/AMDGPU/fmin.ll
@@ -2,16 +2,14 @@
;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
%r3 = select i1 %r2, float %r1, float %r0
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 0a76699b43e1..2d1facfc3a40 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -12,9 +12,9 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
@@ -30,9 +30,9 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index e19a48f3f7e2..6982ee0c0cb3 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmin_legacy_f64
define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
@@ -15,7 +15,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
; FUNC-LABEL: @test_fmin_legacy_ule_f64
define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -30,7 +30,7 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_ole_f64
define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -45,7 +45,7 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_olt_f64
define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -60,7 +60,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_ult_f64
define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 69a0a520a476..79acd02e6d1f 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -1,16 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: Should replace unsafe-fp-math with no signed zeros.
declare i32 @llvm.r600.read.tidig.x() #1
-; FUNC-LABEL: @test_fmin_legacy_f32
+; The two inputs to the instruction are different SGPRs from the same
+; super register, so we can't fold both SGPR operands even though they
+; are both the same register.
+
+; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
; EG: MIN *
-; SI-SAFE: v_min_legacy_f32_e64
-; SI-NONAN: v_min_f32_e64
-define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
@@ -20,6 +24,23 @@ define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> in
ret void
}
+; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+
+; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
+
+define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+ %cmp = fcmp ule float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
; FUNC-LABEL: @test_fmin_legacy_ule_f32
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
@@ -30,8 +51,8 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -49,8 +70,8 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ole float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -68,8 +89,8 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp olt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -87,8 +108,8 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ult float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -172,8 +193,8 @@ define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 ad
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ole float %a, %b
%val0 = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index 4d7b52540d85..04cb01260bc0 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.minnum.f32(float, float) #0
@@ -206,7 +206,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0
; SI-NOT: v_min_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
@@ -220,7 +220,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0
; SI-NOT: v_min_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 1ee92b2f7c08..867c5c252b6c 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -36,8 +36,8 @@ define void @multiple_use_fadd_fmac(float addrspace(1)* %out, float %x, float %y
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%mad = fadd fast float %mul2, %y
- store float %mul2, float addrspace(1)* %out
- store float %mad, float addrspace(1)* %out.gep.1
+ store volatile float %mul2, float addrspace(1)* %out
+ store volatile float %mad, float addrspace(1)* %out.gep.1
ret void
}
@@ -52,8 +52,8 @@ define void @multiple_use_fadd_fmad(float addrspace(1)* %out, float %x, float %y
%x.abs = call float @llvm.fabs.f32(float %x)
%mul2 = fmul fast float %x.abs, 2.0
%mad = fadd fast float %mul2, %y
- store float %mul2, float addrspace(1)* %out
- store float %mad, float addrspace(1)* %out.gep.1
+ store volatile float %mul2, float addrspace(1)* %out
+ store volatile float %mad, float addrspace(1)* %out.gep.1
ret void
}
@@ -66,8 +66,8 @@ define void @multiple_use_fadd_multi_fmad(float addrspace(1)* %out, float %x, fl
%mul2 = fmul fast float %x.abs, 2.0
%mad0 = fadd fast float %mul2, %y
%mad1 = fadd fast float %mul2, %z
- store float %mad0, float addrspace(1)* %out
- store float %mad1, float addrspace(1)* %out.gep.1
+ store volatile float %mad0, float addrspace(1)* %out
+ store volatile float %mad1, float addrspace(1)* %out.gep.1
ret void
}
@@ -80,7 +80,7 @@ define void @fmul_x2_xn2(float addrspace(1)* %out, float %x, float %y) #0 {
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -2.0
%mul = fmul fast float %mul2, %muln2
- store float %mul, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
ret void
}
@@ -94,7 +94,7 @@ define void @fmul_x2_xn3(float addrspace(1)* %out, float %x, float %y) #0 {
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -3.0
%mul = fmul fast float %mul2, %muln2
- store float %mul, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
index addc409c9eb1..9064ad3814d6 100644
--- a/test/CodeGen/AMDGPU/fmul.ll
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -1,12 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
; FUNC-LABEL: {{^}}fmul_f32:
-; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; GCN: v_mul_f32
-; SI: v_mul_f32
+; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fmul float %a, %b
@@ -14,16 +13,16 @@ entry:
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare float @llvm.r600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
; FUNC-LABEL: {{^}}fmul_v2f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-
-; SI: v_mul_f32
-; SI: v_mul_f32
define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fmul <2 x float> %a, %b
@@ -32,15 +31,15 @@ entry:
}
; FUNC-LABEL: {{^}}fmul_v4f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -51,9 +50,9 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
}
; FUNC-LABEL: {{^}}test_mul_2_k:
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
-; SI: s_endpgm
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
+; GCN: s_endpgm
define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 2.0
%z = fmul float %y, 3.0
@@ -62,10 +61,10 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
}
; FUNC-LABEL: {{^}}test_mul_2_k_inv:
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
-; SI-NOT: v_mad_f32
-; SI: s_endpgm
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
+; GCN-NOT: v_mad_f32
+; GCN: s_endpgm
define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 3.0
%z = fmul float %y, 2.0
@@ -76,10 +75,10 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
; There should be three multiplies here; %a should be used twice (once
; negated), not duplicated into mul x, 5.0 and mul x, -5.0.
; FUNC-LABEL: {{^}}test_mul_twouse:
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
%a = fmul float %x, 5.0
%b = fsub float -0.0, %a
@@ -89,4 +88,4 @@ define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
ret void
}
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll
index 600f0cb83578..c80374df4950 100644
--- a/test/CodeGen/AMDGPU/fmuladd.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
declare float @llvm.fmuladd.f32(float, float, float)
declare double @llvm.fmuladd.f64(double, double, double)
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; CHECK-LABEL: {{^}}fmuladd_f32:
@@ -37,13 +37,13 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -56,13 +56,13 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -77,13 +77,13 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
define void @fadd_a_a_b_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r0 = load float, float addrspace(1)* %gep.0
- %r1 = load float, float addrspace(1)* %gep.1
+ %r0 = load volatile float, float addrspace(1)* %gep.0
+ %r1 = load volatile float, float addrspace(1)* %gep.1
%add.0 = fadd float %r0, %r0
%add.1 = fadd float %add.0, %r1
@@ -99,13 +99,13 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
define void @fadd_b_a_a_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r0 = load float, float addrspace(1)* %gep.0
- %r1 = load float, float addrspace(1)* %gep.1
+ %r0 = load volatile float, float addrspace(1)* %gep.0
+ %r1 = load volatile float, float addrspace(1)* %gep.1
%add.0 = fadd float %r0, %r0
%add.1 = fadd float %r1, %add.0
@@ -119,13 +119,13 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -139,13 +139,13 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r1.fneg = fsub float -0.000000e+00, %r1
@@ -161,13 +161,13 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r1.fneg = fsub float -0.000000e+00, %r1
@@ -183,13 +183,13 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r2.fneg = fsub float -0.000000e+00, %r2
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 8830e8273661..b03f318f4571 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; FIXME: Check something here. Currently it seems fabs + fneg aren't folded
; into 2 modifiers, although theoretically that should work.
-; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
@@ -24,8 +24,8 @@ define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
@@ -34,7 +34,7 @@ define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+; GCN-LABEL: {{^}}fneg_fabs_free_f64:
define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @llvm.fabs.f64(double %bc)
@@ -43,9 +43,9 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @fabs(double %bc)
@@ -54,13 +54,14 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
-; SI: s_load_dwordx2
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
-; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+; GCN-LABEL: {{^}}fneg_fabs_f64:
+; GCN-DAG: s_load_dwordx2
+; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
%fabs = call double @llvm.fabs.f64(double %in)
%fsub = fsub double -0.000000e+00, %fabs
@@ -68,11 +69,11 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_v2f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN-NOT: 0x80000000
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
%fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
@@ -80,13 +81,13 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_v4f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN-NOT: 0x80000000
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
%fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
index aa6df209035b..7627a4d32250 100644
--- a/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -39,7 +39,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
; unless the target returns true for isNegFree()
; FUNC-LABEL: {{^}}fneg_free_f64:
-; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fsub = fsub double 0.0, %bc
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll
index 4fac5176fac9..b7ffaed70c5a 100644
--- a/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/test/CodeGen/AMDGPU/fp-classify.ll
@@ -1,9 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
-declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
-declare i32 @llvm.r600.read.tidig.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 12df6606e8ff..be23e10d7087 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @fp_to_sint_f64_i32
; SI: v_cvt_i32_f64_e32
@@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
%cast = fptosi double %val to i64
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index 301a94b4904c..b39aeadc8cce 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
declare float @llvm.fabs.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index 41bc2a780014..760019ebdc08 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}fp_to_uint_i32_f64:
; SI: v_cvt_u32_f64_e32
@@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
%cast = fptoui double %val to i64
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
index 734a43be2296..ad06bdd90a9f 100644
--- a/test/CodeGen/AMDGPU/fpext.ll
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fpext_f32_to_f64:
@@ -18,6 +18,16 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %
ret void
}
+; FUNC-LABEL: {{^}}fpext_v3f32_to_v3f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
+ %result = fpext <3 x float> %in to <3 x double>
+ store <3 x double> %result, <3 x double> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64:
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
new file mode 100644
index 000000000000..68b884363ec5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+
+; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.floor.f64(double) #0
+
+; FUNC-LABEL: {{^}}fract_f64:
+; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e32 [[FLOORX:v\[[0-9]+:[0-9]+\]]], [[X]]
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %floor.x = call double @llvm.floor.f64(double %x)
+ %fract = fsub double %x, %floor.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg:
+; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -[[X]]
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]], -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]]
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %neg.x = fsub double -0.0, %x
+ %floor.neg.x = call double @llvm.floor.f64(double %neg.x)
+ %fract = fsub double %neg.x, %floor.neg.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg_abs:
+; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|, -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %abs.x = call double @llvm.fabs.f64(double %x)
+ %neg.abs.x = fsub double -0.0, %abs.x
+ %floor.neg.abs.x = call double @llvm.floor.f64(double %neg.abs.x)
+ %fract = fsub double %neg.abs.x, %floor.neg.abs.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}multi_use_floor_fract_f64:
+; VI-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; VI-UNSAFE-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[X]]
+; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
+; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]]
+; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]]
+define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %floor.x = call double @llvm.floor.f64(double %x)
+ %fract = fsub double %x, %floor.x
+ store volatile double %floor.x, double addrspace(1)* %out
+ store volatile double %fract, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
new file mode 100644
index 000000000000..7d713f483047
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+
+; GCN-LABEL: {{^}}fract_f32:
+; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+
+; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %floor.x = call float @llvm.floor.f32(float %x)
+ %fract = fsub float %x, %floor.x
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fract_f32_neg:
+; GCN-SAFE: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
+; GCN-SAFE: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
+
+; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %x.neg = fsub float -0.0, %x
+ %floor.x.neg = call float @llvm.floor.f32(float %x.neg)
+ %fract = fsub float %x.neg, %floor.x.neg
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fract_f32_neg_abs:
+; GCN-SAFE: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+; GCN-SAFE: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
+
+; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %abs.x = call float @llvm.fabs.f32(float %x)
+ %neg.abs.x = fsub float -0.0, %abs.x
+ %floor.neg.abs.x = call float @llvm.floor.f32(float %neg.abs.x)
+ %fract = fsub float %neg.abs.x, %floor.neg.abs.x
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_use_floor_fract_f32:
+; GCN-UNSAFE-DAG: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; GCN-UNSAFE-DAG: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[INPUT:v[0-9]+]]
+
+; GCN-UNSAFE: buffer_store_dword [[FLOOR]]
+; GCN-UNSAFE: buffer_store_dword [[FRACT]]
+define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %floor.x = call float @llvm.floor.f32(float %x)
+ %fract = fsub float %x, %floor.x
+ store volatile float %floor.x, float addrspace(1)* %out
+ store volatile float %fract, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index f245ef08cb9d..e0fc263294ab 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -5,11 +5,13 @@
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
-; GCN-DAG: v_cmp
-; GCN-DAG: v_mul_f32
+; GCN: v_div_scale_f32
+
; GCN: v_rcp_f32_e32
+; GCN: v_fma_f32
; GCN: v_mul_f32_e32
-; GCN: v_mul_f32_e32
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
; GCN: v_trunc_f32_e32
; GCN: v_mad_f32
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
new file mode 100644
index 000000000000..ce0881c329be
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
+; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %r0 = load double, double addrspace(1)* %in
+ %r1 = call double @llvm.sqrt.f64(double %r0)
+ store double %r1, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64:
+; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
+ %r0 = load double, double addrspace(1)* %in
+ %r1 = call double @llvm.sqrt.f64(double %r0)
+ store double %r1, double addrspace(1)* %out
+ ret void
+}
+
+declare double @llvm.sqrt.f64(double %Val) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index 04101346cdf9..f98cac6ade3a 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,29 +1,143 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
-; CHECK: {{^}}fsqrt_f32:
-; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
+; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %r0 = load float, float addrspace(1)* %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
+; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
+ %r0 = load float, float addrspace(1)* %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, float addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}s_sqrt_f32:
+; GCN: v_sqrt_f32_e32
+
+; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %fdiv = call float @llvm.sqrt.f32(float %in)
+ store float %fdiv, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sqrt_v2f32:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sqrt_v4f32:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+entry:
+ %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+ store <4 x float> %fdiv, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}elim_redun_check_neg0:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
-define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %r0 = load float, float addrspace(1)* %in
- %r1 = call float @llvm.sqrt.f32(float %r0)
- store float %r1, float addrspace(1)* %out
- ret void
+; FUNC-LABEL: {{^}}elim_redun_check_pos0:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
}
-; CHECK: {{^}}fsqrt_f64:
-; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}elim_redun_check_ult:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
-define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %r0 = load double, double addrspace(1)* %in
- %r1 = call double @llvm.sqrt.f64(double %r0)
- store double %r1, double addrspace(1)* %out
- ret void
+; FUNC-LABEL: {{^}}elim_redun_check_v2:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
}
-declare float @llvm.sqrt.f32(float %Val)
-declare double @llvm.sqrt.f64(double %Val)
+; FUNC-LABEL: {{^}}elim_redun_check_v2_ult
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %in) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index 38d573258a5e..3429df33c015 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -1,7 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_fsub_f32:
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
@@ -24,7 +23,7 @@ define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare float @llvm.r600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index f34a48e30a86..f1b970a4f5fe 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -47,7 +47,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
}
; SI-LABEL: {{^}}s_fsub_imm_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double 4.0, %a
store double %sub, double addrspace(1)* %out
@@ -55,7 +55,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
}
; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double %a, 4.0
store double %sub, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 83a8ad8901d2..c4138ad79c28 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -24,11 +24,11 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; CI: v_trunc_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_not_b64
+; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
index edc08609a8aa..1beeab65ade3 100644
--- a/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
declare float @llvm.trunc.f32(float) nounwind readnone
declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index bc5f031cd4a2..0f2fc836a245 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -12,7 +12,7 @@
; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], 0
; NOHSA: .text
-; HSA: .hsatext
+; HSA: .text
; GCN: readonly:
; GCN: readonly2:
define void @main(i32 %index, float addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll
deleted file mode 100644
index bd9557d730fb..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i1.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; FIXME: Evergreen broken
-
-; FUNC-LABEL: {{^}}zextload_global_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32:
-; SI: s_endpgm
-define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = zext <1 x i1> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32:
-; SI: s_endpgm
-define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = sext <1 x i1> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = zext <2 x i1> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = sext <2 x i1> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = zext <4 x i1> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = sext <4 x i1> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = zext <8 x i1> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = sext <8 x i1> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = zext <16 x i1> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = sext <16 x i1> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32:
-; XSI: s_endpgm
-; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = zext <32 x i1> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32:
-; XSI: s_endpgm
-; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = sext <32 x i1> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32:
-; XSI: s_endpgm
-; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = zext <64 x i1> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32:
-; XSI: s_endpgm
-; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = sext <64 x i1> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
-; SI: buffer_store_dwordx2
-define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i1_to_i64:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = zext <1 x i1> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = sext <1 x i1> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = zext <2 x i1> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = sext <2 x i1> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = zext <4 x i1> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = sext <4 x i1> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = zext <8 x i1> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = sext <8 x i1> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = zext <16 x i1> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = sext <16 x i1> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64:
-; XSI: s_endpgm
-; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = zext <32 x i1> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64:
-; XSI: s_endpgm
-; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = sext <32 x i1> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64:
-; XSI: s_endpgm
-; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = zext <64 x i1> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64:
-; XSI: s_endpgm
-; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = sext <64 x i1> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
deleted file mode 100644
index 103a40dee270..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
-
-; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
-; SI: buffer_load_ushort
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = zext i16 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
-; SI: buffer_load_sshort
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = sext i16 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
-; SI: buffer_load_ushort
-; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = zext <1 x i16> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
-; SI: buffer_load_sshort
-; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = sext <1 x i16> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = zext <2 x i16> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = sext <2 x i16> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = zext <4 x i16> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = sext <4 x i16> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = zext <8 x i16> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = sext <8 x i16> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = zext <16 x i16> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = sext <16 x i16> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
-; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = zext <32 x i16> %load to <32 x i32>
- store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
-; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = sext <32 x i16> %load to <32 x i32>
- store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
-; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = zext <64 x i16> %load to <64 x i32>
- store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
-; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = sext <64 x i16> %load to <64 x i32>
- store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
-; SI: buffer_load_ushort v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = zext i16 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
-; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = sext i16 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = zext <1 x i16> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = sext <1 x i16> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = zext <2 x i16> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = sext <2 x i16> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = zext <4 x i16> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = sext <4 x i16> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = zext <8 x i16> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = sext <8 x i16> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = zext <16 x i16> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = sext <16 x i16> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
-; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = zext <32 x i16> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
-; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = sext <32 x i16> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
-; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = zext <64 x i16> %load to <64 x i64>
- store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
-; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = sext <64 x i16> %load to <64 x i64>
- store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll
deleted file mode 100644
index e5e6be2199c3..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i32.ll
+++ /dev/null
@@ -1,308 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
-; SI: buffer_load_dword v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %a = load i32, i32 addrspace(1)* %in
- %ext = zext i32 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %a = load i32, i32 addrspace(1)* %in
- %ext = sext i32 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64:
-; SI: buffer_load_dword
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
- %ext = zext <1 x i32> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64:
-; SI: buffer_load_dword
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
- %ext = sext <1 x i32> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64:
-; SI: buffer_load_dwordx2
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
- %ext = zext <2 x i32> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64:
-; SI: buffer_load_dwordx2
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
- %ext = sext <2 x i32> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
- %ext = zext <4 x i32> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64:
-; SI: buffer_load_dwordx4
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
- %ext = sext <4 x i32> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
- %ext = zext <8 x i32> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
- %ext = sext <8 x i32> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
- %ext = sext <16 x i32> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
- %ext = zext <16 x i32> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI: s_endpgm
-define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
- %ext = sext <32 x i32> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI: s_endpgm
-define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
- %ext = zext <32 x i32> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll
deleted file mode 100644
index b31d5361d5a2..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i8.ll
+++ /dev/null
@@ -1,299 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}zextload_global_i8_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = zext i8 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i8_to_i32:
-; SI: buffer_load_sbyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = sext i8 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32:
-; SI: s_endpgm
-define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = zext <1 x i8> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32:
-; SI: s_endpgm
-define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = sext <1 x i8> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = zext <2 x i8> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = sext <2 x i8> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = zext <4 x i8> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = sext <4 x i8> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = zext <8 x i8> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = sext <8 x i8> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = zext <16 x i8> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = sext <16 x i8> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32:
-; XSI: s_endpgm
-; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = zext <32 x i8> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32:
-; XSI: s_endpgm
-; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = sext <32 x i8> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32:
-; XSI: s_endpgm
-; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = zext <64 x i8> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32:
-; XSI: s_endpgm
-; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = sext <64 x i8> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
-; SI: buffer_load_ubyte v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = zext i8 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
-; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = sext i8 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = zext <1 x i8> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = sext <1 x i8> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = zext <2 x i8> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = sext <2 x i8> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = zext <4 x i8> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = sext <4 x i8> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = zext <8 x i8> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = sext <8 x i8> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = zext <16 x i8> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = sext <16 x i8> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64:
-; XSI: s_endpgm
-; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = zext <32 x i8> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64:
-; XSI: s_endpgm
-; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = sext <32 x i8> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64:
-; XSI: s_endpgm
-; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = zext <64 x i8> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64:
-; XSI: s_endpgm
-; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = sext <64 x i8> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
diff --git a/test/CodeGen/AMDGPU/global-variable-relocs.ll b/test/CodeGen/AMDGPU/global-variable-relocs.ll
new file mode 100644
index 000000000000..c39394a3527d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -0,0 +1,203 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
+
+@private = private addrspace(1) global [256 x i32] zeroinitializer
+@internal = internal addrspace(1) global [256 x i32] zeroinitializer
+@available_externally = available_externally addrspace(1) global [256 x i32] zeroinitializer
+@linkonce = linkonce addrspace(1) global [256 x i32] zeroinitializer
+@weak= weak addrspace(1) global [256 x i32] zeroinitializer
+@common = common addrspace(1) global [256 x i32] zeroinitializer
+@extern_weak = extern_weak addrspace(1) global [256 x i32]
+@linkonce_odr = linkonce_odr addrspace(1) global [256 x i32] zeroinitializer
+@weak_odr = weak_odr addrspace(1) global [256 x i32] zeroinitializer
+@external = external addrspace(1) global [256 x i32]
+@external_w_init = addrspace(1) global [256 x i32] zeroinitializer
+
+; CHECK-LABEL: private_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private+8
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @private_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: internal_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal+8
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @internal_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: available_externally_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @available_externally_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: linkonce_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @linkonce_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: weak_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @weak_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: common_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @common_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: extern_weak_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @extern_weak_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: linkonce_odr_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce_odr@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @linkonce_odr_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: weak_odr_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @weak_odr_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: external_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @external_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: external_w_init_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @external_w_init_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: .local private
+; CHECK: .local internal
+; CHECK: .weak linkonce
+; CHECK: .weak weak
+; CHECK: .weak linkonce_odr
+; CHECK: .weak weak_odr
+; CHECK-NOT: external{{$}}
+; CHECK: .globl external_w_init
diff --git a/test/CodeGen/AMDGPU/global-zero-initializer.ll b/test/CodeGen/AMDGPU/global-zero-initializer.ll
deleted file mode 100644
index 45aa8bf4e1d7..000000000000
--- a/test/CodeGen/AMDGPU/global-zero-initializer.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-
-; CHECK: error: unsupported initializer for address space in load_init_global_global
-
-@lds = addrspace(1) global [256 x i32] zeroinitializer
-
-define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10
- %ld = load i32, i32 addrspace(1)* %gep
- store i32 %ld, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
index 6786e4a2f375..743ad7c278be 100644
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1,921 +1,1044 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=VI --check-prefix=FUNC %s
-
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}atomic_add_i32_offset:
-; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
+; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_huge_offset:
+; SI-DAG: v_mov_b32_e32 v[[PTRLO:[0-9]+]], 0xdeac
+; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
+; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_add
+define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
-; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-
define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32:
-; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret:
-; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_offset:
-; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
-; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32:
-; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret:
-; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
-; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
-; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32:
-; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
-; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_offset:
-; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
-; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32:
-; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret:
-; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
-; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
-; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32:
-; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
-; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_offset:
-; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
-; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32:
-; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret:
-; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
-; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
-; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32:
-; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
-; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_offset:
-; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
-; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32:
-; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret:
-; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
-; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
-; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+
+; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
+
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32:
-; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
-; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+
+; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
-; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
-; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32:
-; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
-; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_offset:
+; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
+ %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32:
+; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_addr64_offset:
+; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_addr64:
+; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
+ %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_offset:
+; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32:
+; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
+; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
+; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll
new file mode 100644
index 000000000000..2bae66d5aea8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -0,0 +1,1037 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}atomic_add_i64_offset:
+; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
+; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
+; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
+; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64:
+; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret:
+; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64:
+; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
+; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_offset:
+; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
+; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
+; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
+; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64:
+; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret:
+; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64:
+; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
+; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_offset:
+; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
+; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
+; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
+; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64:
+; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret:
+; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
+; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
+; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_offset:
+; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
+; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
+; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
+; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64:
+; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret:
+; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64:
+; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
+; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_offset:
+; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
+; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
+; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
+; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64:
+; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret:
+; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
+; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
+; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_offset:
+; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
+; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
+; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
+; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64:
+; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret:
+; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64:
+; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
+; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_offset:
+; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
+; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
+; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
+; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64:
+; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret:
+; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
+; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
+; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_offset:
+; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
+; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
+; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
+; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64:
+; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret:
+; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64:
+; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
+; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
+; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
+; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
+; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
+; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64:
+; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
+; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
+; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
+; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_offset:
+; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
+; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
+; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
+; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64:
+; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret:
+; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
+; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
+; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
+; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
+; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
+; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
+; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
+; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
+; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
+; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_offset:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
+ %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
+ %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_offset:
+; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
+define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+entry:
+ store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
deleted file mode 100644
index 014b0a5482ab..000000000000
--- a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-
-@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
-
-; FUNC-LABEL: {{^}}test_i8:
-; EG: CF_END
-; SI: buffer_store_byte
-; SI: s_endpgm
-define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
- %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s
- %1 = load i8, i8 addrspace(2)* %arrayidx, align 1
- store i8 %1, i8 addrspace(1)* %out
- ret void
-}
-
-@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
-
-; FUNC-LABEL: {{^}}test_i16:
-; EG: CF_END
-; SI: buffer_store_short
-; SI: s_endpgm
-define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
- %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s
- %1 = load i16, i16 addrspace(2)* %arrayidx, align 2
- store i16 %1, i16 addrspace(1)* %out
- ret void
-}
-
-%struct.bar = type { float, [5 x i8] }
-
-; The illegal i8s aren't handled
-@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
-
-; FUNC-LABEL: {{^}}struct_bar_gv_load:
-define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
- %load = load i8, i8 addrspace(2)* %gep, align 1
- store i8 %load, i8 addrspace(1)* %out, align 1
- ret void
-}
-
-
-; The private load isn't scalarzied.
-@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
- <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
- <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
- <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
-
-; FUNC-LABEL: {{^}}array_vector_gv_load:
-define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
- %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16
- store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index d4d13125cfbf..1f9b536cd80b 100644
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
@@ -10,13 +11,10 @@
; FUNC-LABEL: {{^}}float:
; GCN: s_load_dword
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
+; EG: VTX_READ_32
+; EG: @float_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @float(float addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
@@ -31,13 +29,10 @@ entry:
; GCN: s_load_dword
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
+; EG: VTX_READ_32
+; EG: @i32_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @i32(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
@@ -54,6 +49,10 @@ entry:
; FUNC-LABEL: {{^}}struct_foo_gv_load:
; GCN: s_load_dword
+; EG: VTX_READ_32
+; EG: @struct_foo_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
%load = load i32, i32 addrspace(2)* %gep, align 4
@@ -68,6 +67,11 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
; FUNC-LABEL: {{^}}array_v1_gv_load:
; GCN: s_load_dword
+
+; EG: VTX_READ_32
+; EG: @array_v1_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
%load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
@@ -75,6 +79,11 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
ret void
}
+; FUNC-LABEL: {{^}}gv_addressing_in_branch:
+
+; EG: VTX_READ_32
+; EG: @float_gv
+; EG-NOT: MOVA_INT
define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
entry:
%0 = icmp eq i32 0, %a
diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll
new file mode 100644
index 000000000000..c75fdb35dd0e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -relocation-model=static < %s | FileCheck %s
+
+@lds = external addrspace(3) global [4 x i32]
+
+; Function Attrs: nounwind
+
+; Offset folding is an optimization done for global variables with relocations,
+; which allows the offset to be stored in the r_addend of the relocation entry.
+; The offset is applied to the variable's address at link time, which eliminates
+; the need to emit shader instructions to do this calculation.
+; We don't use relocations for local memory, so we should never fold offsets
+; for local memory globals. An illustrative counterpart is sketched below.
+
+; CHECK-LABEL: lds_no_offset:
+; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
+define void @lds_no_offset() {
+entry:
+ %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
+ store i32 0, i32 addrspace(3)* %ptr
+ ret void
+}
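+
+; For contrast, a hypothetical global-memory counterpart (illustrative sketch
+; only, not exercised by this test; @gv and @global_offset_folded are made-up
+; names) would be a folding candidate: addrspace(1) globals are addressed
+; through relocations, so the constant +4 below could be absorbed into the
+; relocation's r_addend instead of being computed with shader instructions.
+;
+;   @gv = external addrspace(1) global [4 x i32]
+;   define void @global_offset_folded() {
+;   entry:
+;     %ptr = getelementptr [4 x i32], [4 x i32] addrspace(1)* @gv, i32 0, i32 1
+;     store i32 0, i32 addrspace(1)* %ptr
+;     ret void
+;   }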
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index a02cbf43c400..d21d66176a14 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; half args should be promoted to float
@@ -13,10 +13,11 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
}
; GCN-LABEL: {{^}}load_v2f16_arg:
-; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
store <2 x half> %arg, <2 x half> addrspace(1)* %out
@@ -42,10 +43,7 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
store <4 x half> %arg, <4 x half> addrspace(1)* %out
@@ -280,11 +278,11 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(
}
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
-; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
@@ -318,22 +316,8 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
}
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
@@ -378,10 +362,10 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace
}
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
-; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
+; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
@@ -396,22 +380,18 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32
-; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]]
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in
@@ -459,8 +439,9 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
-; GCN-DAG: buffer_store_short [[CVT0]]
-; GCN-DAG: buffer_store_short [[CVT1]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
+; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
+; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
%val = load <2 x float>, <2 x float> addrspace(1)* %in
@@ -491,10 +472,7 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%val = load <4 x float>, <4 x float> addrspace(1)* %in
@@ -514,14 +492,7 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
%val = load <8 x float>, <8 x float> addrspace(1)* %in
@@ -551,22 +522,8 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_dwordx4
+; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
%val = load <16 x float>, <16 x float> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/hsa-default-device.ll b/test/CodeGen/AMDGPU/hsa-default-device.ll
new file mode 100644
index 000000000000..631d6def4442
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; Make sure that with an HSA triple, we don't default to an
+; unsupported device.
+
+; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+ store float 0.0, float addrspace(1)* %out0
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
new file mode 100644
index 000000000000..36aa6779d382
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_default_ci:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_default_vi:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f64_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f32_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 48
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f32_f64_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 240
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 0
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="kaveri" }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
new file mode 100644
index 000000000000..28c8b5d73b02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+
+; The SHT_NOTE section contains the output from the .hsa_code_object_*
+; directives.
+
+; ELF: Section {
+; ELF: Name: .text
+; ELF: Type: SHT_PROGBITS (0x1)
+; ELF: Flags [ (0x6)
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_EXECINSTR (0x4)
+; ELF: }
+
+; ELF: SHT_NOTE
+; ELF: 0000: 04000000 08000000 01000000 414D4400
+; ELF: 0010: 02000000 01000000 04000000 1B000000
+
+; ELF: 0020: 03000000 414D4400 04000700 07000000
+; ELF: 0030: 00000000 00000000 414D4400 414D4447
+; ELF: 0040: 50550000
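+;
+; A rough decode of the note bytes above, assuming the standard ELF note layout
+; (namesz, descsz, type, name, desc) and the HSA note types:
+;   note 1: namesz=4, descsz=8,  type=1, name="AMD\0",
+;           desc = code object version major=2, minor=1
+;   note 2: namesz=4, descsz=27, type=3, name="AMD\0",
+;           desc = vendor_name_size=4, arch_name_size=7, major=7, minor=0,
+;                  stepping=0, vendor="AMD\0", arch="AMDGPU\0"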
+
+; ELF: Symbol {
+; ELF: Name: simple
+; ELF: Size: 288
+; ELF: Type: Function (0x2)
+; ELF: }
+
+; HSA: .hsa_code_object_version 2,1
+; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
+
+; HSA: .text
+
+; HSA-NOT: .amdgpu_hsa_kernel simple
+; HSA: {{^}}simple:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: .end_amd_kernel_code_t
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+
+; Make sure we are setting the ATC bit:
+; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
+; On VI+ we also need to set MTYPE = 2
+; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
+; Make sure we generate flat store for HSA
+; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+
+; HSA: .Lfunc_end0:
+; HSA: .size simple, .Lfunc_end0-simple
+
+define void @simple(i32 addrspace(1)* %out) {
+entry:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index 90322ac3dc01..df478fbcf3b5 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -1,14 +1,11 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF
+@linkonce_odr_global_program = linkonce_odr addrspace(1) global i32 0
+@linkonce_global_program = linkonce addrspace(1) global i32 0
@internal_global_program = internal addrspace(1) global i32 0
@common_global_program = common addrspace(1) global i32 0
@external_global_program = addrspace(1) global i32 0
-@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent"
-@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent"
-@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent"
-
@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
@external_readonly = unnamed_addr addrspace(2) constant i32 0
@@ -16,133 +13,38 @@ define void @test() {
ret void
}
-; ASM: .amdgpu_hsa_module_global internal_global
-; ASM: .size internal_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: internal_global_program:
-; ASM: .long 0
-
-; ASM: .amdgpu_hsa_module_global common_global
-; ASM: .size common_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: common_global_program:
+; ASM: .type linkonce_odr_global_program,@object
+; ASM: .section .bss,#alloc,#write
+; ASM: .weak linkonce_odr_global_program
+; ASM: linkonce_odr_global_program:
; ASM: .long 0
+; ASM: .size linkonce_odr_global_program, 4
-; ASM: .amdgpu_hsa_program_global external_global
-; ASM: .size external_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: external_global_program:
+; ASM: .type linkonce_global_program,@object
+; ASM: .weak linkonce_global_program
+; ASM: linkonce_global_program:
; ASM: .long 0
+; ASM: .size linkonce_global_program, 4
-; ASM: .amdgpu_hsa_module_global internal_global
-; ASM: .size internal_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: internal_global_agent:
-; ASM: .long 0
+; ASM: .type internal_global_program,@object
+; ASM: .local internal_global_program
+; ASM: .comm internal_global_program,4,2
-; ASM: .amdgpu_hsa_module_global common_global
-; ASM: .size common_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: common_global_agent:
-; ASM: .long 0
+; ASM: .type common_global_program,@object
+; ASM: .comm common_global_program,4,2
-; ASM: .amdgpu_hsa_program_global external_global
-; ASM: .size external_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: external_global_agent:
+; ASM: external_global_program:
; ASM: .long 0
+; ASM: .size external_global_program, 4
-; ASM: .amdgpu_hsa_module_global internal_readonly
-; ASM: .size internal_readonly, 4
-; ASM: .hsatext
+; ASM: .type internal_readonly,@object
+; ASM: .text
; ASM: internal_readonly:
; ASM: .long 0
+; ASM: .size internal_readonly, 4
-; ASM: .amdgpu_hsa_program_global external_readonly
-; ASM: .size external_readonly, 4
-; ASM: .hsatext
+; ASM: .type external_readonly,@object
+; ASM: .globl external_readonly
; ASM: external_readonly:
; ASM: .long 0
-
-; ELF: Section {
-; ELF: Name: .hsadata_global_program
-; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0x100003)
-; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
-; ELF: SHF_WRITE (0x1)
-; ELF: ]
-; ELF: }
-
-; ELF: Section {
-; ELF: Name: .hsadata_global_agent
-; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0x900003)
-; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
-; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
-; ELF: SHF_WRITE (0x1)
-; ELF: ]
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: common_global_agent
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: common_global_program
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_global_agent
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_global_program
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_readonly
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsatext
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_global_agent
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_global_program
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_readonly
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsatext
-; ELF: }
+; ASM: .size external_readonly, 4
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index f82e98e79545..1b4a0f3090b8 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s
-; HSA: .hsa_code_object_version 1,0
+; HSA: .hsa_code_object_version 2,1
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index c089dfd9a971..82d7da188ca3 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -9,34 +9,31 @@
; directives.
; ELF: Section {
-; ELF: Name: .hsatext
+; ELF: Name: .text
; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0xC00007)
+; ELF: Flags [ (0x6)
; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
-; ELF: SHF_AMDGPU_HSA_CODE (0x400000)
; ELF: SHF_EXECINSTR (0x4)
-; ELF: SHF_WRITE (0x1)
; ELF: }
; ELF: SHT_NOTE
; ELF: 0000: 04000000 08000000 01000000 414D4400
-; ELF: 0010: 01000000 00000000 04000000 1B000000
+; ELF: 0010: 02000000 01000000 04000000 1B000000
; ELF: 0020: 03000000 414D4400 04000700 07000000
; ELF: 0030: 00000000 00000000 414D4400 414D4447
; ELF: 0040: 50550000
; ELF: Symbol {
; ELF: Name: simple
-; ELF: Size: 296
+; ELF: Size: 288
; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
; ELF: }
-; HSA: .hsa_code_object_version 1,0
+; HSA: .hsa_code_object_version 2,1
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
-; HSA: .hsatext
+; HSA: .text
; HSA: .amdgpu_hsa_kernel simple
; HSA: {{^}}simple:
@@ -51,12 +48,12 @@
; On VI+ we also need to set MTYPE = 2
; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
; Make sure we generate flat store for HSA
-; HSA: flat_store_dword v{{[0-9]+}}
+; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; HSA: .Lfunc_end0:
; HSA: .size simple, .Lfunc_end0-simple
-define void @simple(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index b11a21137642..d6309a2dd5de 100644
--- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -4,9 +4,8 @@
; SILowerI1Copies was not handling IMPLICIT_DEF
; SI-LABEL: {{^}}br_implicit_def:
; SI: BB#0:
-; SI-NEXT: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
-; SI-NEXT: BB#1:
+; SI-NEXT: s_and_b64 vcc, exec
+; SI-NEXT: s_cbranch_vccnz
define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
bb:
br i1 undef, label %bb1, label %bb2
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 105cd06b330a..4d50dc2f4023 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,9 +10,11 @@
; SI: s_and_saveexec_b64
; SI: s_xor_b64
; SI: s_endpgm
-define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
+define void @br_i1_phi(i32 %arg) {
bb:
- br i1 %arg1, label %bb2, label %bb3
+ %tidig = call i32 @llvm.r600.read.tidig.x() #0
+ %cmp = trunc i32 %tidig to i1
+ br i1 %cmp, label %bb2, label %bb3
bb2: ; preds = %bb
br label %bb3
@@ -22,9 +24,14 @@ bb3: ; preds = %bb2, %bb
br i1 %tmp, label %bb4, label %bb6
bb4: ; preds = %bb3
- %tmp5 = mul i32 undef, %arg
+ %val = load volatile i32, i32 addrspace(1)* undef
+ %tmp5 = mul i32 %val, %arg
br label %bb6
bb6: ; preds = %bb4, %bb3
ret void
}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index 8db9ea4ccf31..674eceee8122 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -23,7 +23,7 @@ entry:
; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
+; CHECK-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
store i64 -9223372036854775808, i64 addrspace(1) *%out
@@ -31,7 +31,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
}
; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; CHECK: buffer_store_dword [[REG]]
define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
store i32 -2147483648, i32 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
}
; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; CHECK: buffer_store_dword [[REG]]
define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
store float -0.0, float addrspace(1)* %out
@@ -322,7 +322,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.0
@@ -333,7 +333,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.5
@@ -344,7 +344,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -0.5
@@ -355,7 +355,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 1.0
@@ -366,7 +366,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -1.0
@@ -377,7 +377,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 2.0
@@ -388,7 +388,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -2.0
@@ -399,7 +399,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 4.0
@@ -410,7 +410,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -4.0
@@ -422,7 +422,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000001
@@ -433,7 +433,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000002
@@ -444,7 +444,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000010
@@ -455,7 +455,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xffffffffffffffff
@@ -466,7 +466,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffffe
@@ -477,7 +477,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffff0
@@ -488,7 +488,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x000000000000003F
@@ -499,7 +499,7 @@ define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000040
@@ -510,7 +510,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64:
; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
-; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
store double 0.0, double addrspace(1)* %out
@@ -520,7 +520,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
+; CHECK-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
store double -0.0, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e40cac22725c..66cec88e760c 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
@@ -87,13 +87,34 @@ entry:
; CHECK: s_cbranch_execnz
define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
%value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
store i32 %value, i32 addrspace(1)* %out
ret void
}
+; CHECK-LABEL: {{^}}extract_undef_offset_sgpr:
+define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK-NEXT: v_movreld_b32
+define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK-LABEL: {{^}}insert_w_offset:
; CHECK: s_mov_b32 m0
; CHECK-NEXT: v_movreld_b32_e32
@@ -152,7 +173,7 @@ entry:
; CHECK: s_cbranch_execnz
define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
@@ -167,12 +188,304 @@ entry:
; CHECK: s_cbranch_execnz
define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -16
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+ %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={SGPR4}" ()
+ %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+ store volatile i32 %val0, i32 addrspace(1)* %out0
+ store volatile i32 %val1, i32 addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.reg, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+
+; CHECK: buffer_store_dword [[INS0]]
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+ %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
+ %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.val, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+ br label %bb7
+
+bb4:
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+ br label %bb7
+
+bb7:
+ %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile float %tmp8, float addrspace(1)* undef
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1: ; preds = %bb
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+ br label %bb7
+
+bb4: ; preds = %bb
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+ br label %bb7
+
+bb7: ; preds = %bb4, %bb1
+ %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+ ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+ %tmp1 = add i32 %arg, -16
+ %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 0.000000e+00, i32 %tmp1
+ %tmp3 = add i32 %arg, -16
+ %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float 0x3FB99999A0000000, i32 %tmp3
+ %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+ %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+ %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+ %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+ store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+ store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+ ret void
+}
+
+; If the offset puts the index outside of the superregister boundaries, clamp to the 1st element.
+; CHECK-LABEL: {{^}}extract_largest_inbounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_mov_b32 m0, [[IDX]]
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 3
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_out_of_bounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_add_i32 m0, [[IDX]], 4
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 4
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that the or is folded into the base address register instead of
+; being added to m0.
+
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1
+ %value = extractelement <4 x i32> %ld, i32 %idx
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-undef.mir b/test/CodeGen/AMDGPU/indirect-addressing-undef.mir
new file mode 100644
index 000000000000..7cd35d41f30c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-addressing-undef.mir
@@ -0,0 +1,327 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-lower-control-flow -o - %s | FileCheck %s
+# Getting an undef that is specifically a VGPR is tricky to produce from IR.
+
+# CHECK-LABEL: name: extract_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+
+--- |
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ define void @extract_undef_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @extract_undef_neg_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_neg_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_value_offset_vgpr(<4 x i32> addrspace(1)*%out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 undef, i32 %idx
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+...
+---
+name: extract_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+ S_WAITCNT 3952
+ %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+name: extract_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+ S_WAITCNT 3952
+ %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+name: insert_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+name: insert_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: insert_undef_value_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 undef %vgpr10, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+name: insert_undef_value_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 2, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, killed %vgpr4, 0, undef %vgpr10, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index 2a3b29f54fa9..1f851f9de535 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,24 +1,31 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
-
-declare void @llvm.AMDGPU.barrier.local() convergent nounwind
+declare void @llvm.amdgcn.s.barrier() #0
; SI-LABEL: {{^}}private_access_f64_alloca:
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
%val = load double, double addrspace(1)* %in, align 8
- %array = alloca double, i32 16, align 8
- %ptr = getelementptr double, double* %array, i32 %b
+ %array = alloca [16 x double], align 8
+ %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
store double %val, double* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load double, double* %ptr, align 8
store double %result, double addrspace(1)* %out, align 8
ret void
@@ -26,19 +33,30 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
; SI-LABEL: {{^}}private_access_v2f64_alloca:
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
- %array = alloca <2 x double>, i32 16, align 16
- %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+ %array = alloca [8 x <2 x double>], align 16
+ %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load <2 x double>, <2 x double>* %ptr, align 16
store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -46,17 +64,25 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
; SI-LABEL: {{^}}private_access_i64_alloca:
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
%val = load i64, i64 addrspace(1)* %in, align 8
- %array = alloca i64, i32 16, align 8
- %ptr = getelementptr i64, i64* %array, i32 %b
+ %array = alloca [8 x i64], align 8
+ %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
store i64 %val, i64* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load i64, i64* %ptr, align 8
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -64,20 +90,35 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
; SI-LABEL: {{^}}private_access_v2i64_alloca:
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
+define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
- %array = alloca <2 x i64>, i32 16, align 16
- %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
+ %array = alloca [8 x <2 x i64>], align 16
+ %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
ret void
}
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index 9c8d3534f8ad..1f5b8be15e2e 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; CHECK: {{^}}inline_asm:
+; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
; CHECK: s_endpgm
define void @inline_asm(i32 addrspace(1)* %out) {
@@ -11,13 +11,175 @@ entry:
ret void
}
-; CHECK: {{^}}inline_asm_shader:
+; CHECK-LABEL: {{^}}inline_asm_shader:
; CHECK: s_endpgm
; CHECK: s_endpgm
-define void @inline_asm_shader() #0 {
+define amdgpu_ps void @inline_asm_shader() {
entry:
call void asm sideeffect "s_endpgm", ""()
ret void
}
-attributes #0 = { "ShaderType"="0" }
+
+; CHECK-LABEL: {{^}}branch_on_asm:
+; Make sure inline assembly is treated as divergent.
+; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK: s_and_saveexec_b64
+define void @branch_on_asm(i32 addrspace(1)* %out) {
+ %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
+ %cmp = icmp eq i32 %zero, 0
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v_cmp_asm:
+; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_cmp_ne_i32_e64 s{{\[}}[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]{{\]}}, 0, [[SRC]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+ %sgpr = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %in)
+ store i64 %sgpr, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm:
+; CHECK: codeLenInByte = 12
+define void @code_size_inline_asm(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e64", ""()
+ ret void
+}
+
+; All inlineasm instructions are assumed to be the maximum size
+; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
+; CHECK: codeLenInByte = 12
+define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e32", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+ v_nop_e64
+ v_nop_e64
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+ v_nop_e64
+
+ v_nop_e64
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+; comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; first comment ; second comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; first comment;second comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+ v_nop_e64 ; inline comment
+; separate comment
+ v_nop_e64
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e64 ; inline comment
+; separate comment
+ v_nop_e64
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+ v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
+; separate comment
+ v_bfrev_b32_e32 v0, 1
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/input-mods.ll b/test/CodeGen/AMDGPU/input-mods.ll
index 1c4d285cbcb1..720790df7e16 100644
--- a/test/CodeGen/AMDGPU/input-mods.ll
+++ b/test/CodeGen/AMDGPU/input-mods.ll
@@ -9,18 +9,16 @@
;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @llvm.fabs.f32(float %r0)
%r2 = fsub float -0.000000e+00, %r1
%r3 = call float @llvm.exp2.f32(float %r2)
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.exp2.f32(float) readnone
declare float @llvm.fabs.f32(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 7f9579e59782..367e7f734556 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -9,168 +9,296 @@
; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
-; SI-LABEL: {{^}}insertelement_v4f32_0:
-; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
-; v_mov_b32_e32
-; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
-; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
-; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
+; GCN-LABEL: {{^}}insertelement_v4f32_0:
+; GCN: s_load_dwordx4
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_1:
+; GCN-LABEL: {{^}}insertelement_v4f32_1:
define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_2:
+; GCN-LABEL: {{^}}insertelement_v4f32_2:
define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_3:
+; GCN-LABEL: {{^}}insertelement_v4f32_3:
define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4i32_0:
+; GCN-LABEL: {{^}}insertelement_v4i32_0:
define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
%vecins = insertelement <4 x i32> %a, i32 999, i32 0
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
-; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-LABEL: {{^}}insertelement_v3f32_1:
+define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v3f32_2:
+define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v3f32_3:
+define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_to_sgpr:
+; GCN-NOT: v_readfirstlane
+define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
+ %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
+ %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
+ %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tmp2
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
-; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: buffer_store_dword v
+define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16f32:
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
-; SI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx2
define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i32> %a, i32 5, i32 %b
store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 5
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: buffer_store_dword v
+define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
+ store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i32> %a, i32 5, i32 %b
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i32> %a, i32 5, i32 %b
store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
ret void
}
-
-; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
-; FIXMESI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i16> %a, i16 5, i32 %b
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
+define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
+ store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:6
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i16> %a, i16 5, i32 %b
- store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
+ store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
ret void
}
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
-; FIXMESI: BUFFER_STORE_USHORT
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_short v{{[0-9]+}}, off
define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
-; FIXMESI: buffer_store_dword
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
+define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
+ store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:3
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off
define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
- store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
+ store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
-; FIXMESI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i8> %a, i8 5, i32 %b
- store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
+ store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i8> %a, i8 5, i32 %b
store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
@@ -179,7 +307,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
-; SI-LABEL: {{^}}insert_split_bb:
+; GCN-LABEL: {{^}}insert_split_bb:
define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
%0 = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -203,30 +331,30 @@ endif:
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
-; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
-; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
-; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
+; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
+; GCN: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
+; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: s_mov_b32 m0, [[SCALEDIDX]]
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+; GCN: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
; Increment to next element.
; FIXME: Should be able to manipulate m0 directly instead of add and
; copy.
-; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
-; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; SI-DAG: s_mov_b32 m0, [[IDX1]]
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; FIXME: Should avoid resetting m0 to same value
+; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
@@ -234,44 +362,52 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d
}
; FIXME: Inline immediate should be folded into v_movreld_b32.
-; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
-; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
-; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
-; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
-; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i64> %a, i64 5, i32 %b
store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
ret void
}
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
+define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
+ store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
+ ret void
+}
+
; FIXME: Should be able to do without stack access. The used stack
; space is also 2x what should be required.
-; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
-; SI: SCRATCH_RSRC_DWORD
+; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
+; GCN: SCRATCH_RSRC_DWORD
; Stack store
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; Write element
-; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; Stack reload
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; Store result
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-; SI: ScratchSize: 64
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
+; GCN: ScratchSize: 64
define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
%vecins = insertelement <4 x double> %a, double 8.0, i32 %b
@@ -279,29 +415,31 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
-; SI: SCRATCH_RSRC_DWORD
+; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
+; GCN: SCRATCH_RSRC_DWORD
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
-; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-; SI: ScratchSize: 128
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
+; GCN: ScratchSize: 128
define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
%vecins = insertelement <8 x double> %a, double 8.0, i32 %b
store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
ret void
}
+
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
new file mode 100644
index 000000000000..c29434f5eca2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -0,0 +1,8 @@
+; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
+define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %stof
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 2a01a621fc42..347170f79e32 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -7,8 +7,8 @@
; from constant/invariant memory.
; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load:
-; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
; GCN: buffer_store_dword [[K]], [[PTR]]
define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
@@ -21,7 +21,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add
; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load:
; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
-; GCN: buffer_store_dword [[K]], s{{\[}}[[SPTR_LO]]:
+; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
%ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
@@ -32,4 +32,4 @@ define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 a
!0 = !{}
-attributes #0 = { nounwind }
\ No newline at end of file
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/jump-address.ll b/test/CodeGen/AMDGPU/jump-address.ll
index f55912e37401..9fde31f922cd 100644
--- a/test/CodeGen/AMDGPU/jump-address.ll
+++ b/test/CodeGen/AMDGPU/jump-address.ll
@@ -4,7 +4,7 @@
; CHECK: EXPORT
; CHECK-NOT: EXPORT
-define void @main() #0 {
+define amdgpu_ps void @main() {
main_body:
%0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%1 = extractelement <4 x float> %0, i32 0
@@ -36,7 +36,7 @@ ENDIF: ; preds = %IF13, %ELSE, %main_
%17 = insertelement <4 x float> %16, float %temp1.0, i32 1
%18 = insertelement <4 x float> %17, float %temp2.0, i32 2
%19 = insertelement <4 x float> %18, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %19, i32 0, i32 0)
ret void
IF13: ; preds = %ELSE
@@ -47,6 +47,4 @@ IF13: ; preds = %ELSE
br label %ENDIF
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll
index 7e2291cfdc35..43448fbd7b33 100644
--- a/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -36,15 +36,15 @@ main_body:
%29 = extractelement <4 x float> %28, i32 3
%30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
%36 = insertelement <4 x float> undef, float %32, i32 0
%37 = insertelement <4 x float> %36, float %33, i32 1
%38 = insertelement <4 x float> %37, float %34, i32 2
%39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
ret void
}
@@ -84,17 +84,17 @@ main_body:
%29 = extractelement <4 x float> %28, i32 2
%30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
%36 = insertelement <4 x float> undef, float %32, i32 0
%37 = insertelement <4 x float> %36, float %33, i32 1
%38 = insertelement <4 x float> %37, float %34, i32 2
%39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
ret void
}
-declare float @llvm.AMDIL.clamp.(float, float, float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
new file mode 100644
index 000000000000..21c92dbc9098
--- /dev/null
+++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -0,0 +1,44 @@
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Test that the alignment of kernel arguments does not impact the
+; alignment of the stack
+
+; CHECK-LABEL: {{^}}no_args:
+; CHECK: ScratchSize: 8{{$}}
+define void @no_args() {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align32:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align32(<8 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align64:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align64(<16 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align128:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align128(<32 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align256:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align256(<64 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index e9d98ac89e72..7567b38e0cea 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
@@ -475,3 +475,55 @@ entry:
; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
; ret void
; }
+
+; FUNC-LABEL: {{^}}i1_arg:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+ store i1 %x, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: v_ashrrev_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 84380b421051..099f0639b34c 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
@@ -7,17 +7,19 @@
; ALL-LABEL: {{^}}large_alloca_compute_shader:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
+; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
+; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
; GCNHSA: .amd_kernel_code_t
; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
-; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 8
; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
@@ -29,7 +31,7 @@
; GCNHSA: enable_sgpr_queue_ptr = 0
; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
; GCNHSA: enable_sgpr_dispatch_id = 0
-; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 1
; GCNHSA: enable_sgpr_private_segment_size = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
@@ -39,8 +41,8 @@
; GCNHSA: .end_amd_kernel_code_t
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
; Scratch size = alloca size + emergency stack slot
; ALL: ; ScratchSize: 32772
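; (Worked out under the assumption above: the [8192 x i32] alloca is
; 8192 * 4 = 32768 bytes, plus a 4-byte emergency stack slot, giving 32772.)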
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index b6f8093313cb..fb0e15eb0cb9 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -2,17 +2,17 @@
; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s10, -1
+; CI-DAG: s_mov_b32 s11, 0xe8f000
+; VI-DAG: s_mov_b32 s11, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; ALL: ; ScratchSize: 32772
-define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
+define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
%large = alloca [8192 x i32], align 4
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
store volatile i32 %x, i32* %gep
@@ -23,17 +23,17 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
}
; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s10, -1
+; CI-DAG: s_mov_b32 s11, 0xe8f000
+; VI-DAG: s_mov_b32 s11, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; ALL: ; ScratchSize: 32772
-define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
+define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
%large = alloca [8192 x i32], align 4
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
store volatile i32 %x, i32* %gep
@@ -44,4 +44,3 @@ define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
new file mode 100644
index 000000000000..f661939214c0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
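+; (Assumed relationship, for illustration: the promoted LDS array holds one
+; private [5 x i32] copy per work item, so its outer dimension matches the
+; "amdgpu-max-work-group-size" attribute on each function below.)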
+
+define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @occupancy_0(
+; CHECK: alloca [5 x i32]
+define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @occupancy_max(
+; CHECK: alloca [5 x i32]
+define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }
+attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }
+attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }
diff --git a/test/CodeGen/AMDGPU/large-work-group-registers.ll b/test/CodeGen/AMDGPU/large-work-group-registers.ll
new file mode 100644
index 000000000000..468633da56d8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-work-group-registers.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -post-RA-scheduler=0 < %s | FileCheck %s
+
+; CHECK: NumVgprs: 64
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+ %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+ %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+ %10 = extractelement <3 x i32> %7, i32 0
+ %11 = extractelement <3 x i32> %7, i32 1
+ %12 = mul i32 %10, %11
+ %bc = bitcast <3 x i32> %7 to <3 x float>
+ %13 = extractelement <3 x float> %bc, i32 1
+ %14 = insertelement <512 x float> undef, float %13, i32 %12
+ call void @llvm.amdgcn.s.barrier()
+ %15 = extractelement <3 x i32> %6, i32 0
+ %16 = extractelement <3 x i32> %7, i32 0
+ %17 = shl i32 %15, 5
+ %18 = add i32 %17, %16
+ %19 = shl i32 %18, 4
+ %20 = extractelement <3 x i32> %7, i32 1
+ %21 = shl i32 %20, 2
+ %22 = sext i32 %21 to i64
+ %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+ %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+ %25 = load i32, i32 addrspace(3)* %24, align 4
+ %26 = extractelement <512 x float> %14, i32 %25
+ %27 = insertelement <4 x float> undef, float %26, i32 0
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "amdgpu-max-work-group-size"="1024" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/lds-alignment.ll b/test/CodeGen/AMDGPU/lds-alignment.ll
new file mode 100644
index 000000000000..99334585e589
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -0,0 +1,268 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s
+
+@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
+@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
+
+@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
+@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32
+
+@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
+@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
+
+declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0
+
+
+; HSA-LABEL: {{^}}test_no_round_size_1:
+; HSA: workgroup_group_segment_byte_size = 38
+define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; There are two objects, so one requires padding to be correctly
+; aligned after the other.
+
+; (38 -> 48) + 38 = 86
+
+; I don't think it is necessary to add padding after the last object, since
+; if there were a dynamically sized LDS kernel argument, the runtime should
+; add the alignment padding if needed.
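+;
+; (Sketch of the assumed layout: @lds.align16.0 at offset 0 occupies 38 bytes,
+; @lds.align16.1 starts at the next 16-byte boundary, offset 48, so the total
+; is 48 + 38 = 86 bytes.)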
+
+; HSA-LABEL: {{^}}test_round_size_2:
+; HSA: workgroup_group_segment_byte_size = 86
+; HSA: group_segment_alignment = 4
+define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+
+ %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false)
+
+ ret void
+}
+
+; 38 + (10 pad) + 38
+; HSA-LABEL: {{^}}test_round_size_2_align_8:
+; HSA: workgroup_group_segment_byte_size = 86
+; HSA: group_segment_alignment = 4
+define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
+; HSA: workgroup_group_segment_byte_size = 38
+; HSA: group_segment_alignment = 4
+define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; HSA-LABEL: {{^}}test_round_lds_arg:
+; HSA: workgroup_group_segment_byte_size = 0
+; HSA: group_segment_alignment = 4
+define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; FIXME: Parameter alignment not considered
+; HSA-LABEL: {{^}}test_high_align_lds_arg:
+; HSA: workgroup_group_segment_byte_size = 0
+; HSA: group_segment_alignment = 4
+define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
+ ret void
+}
+
+; (7 * 8) + (39 * 4) = 212
+; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
+; HSA: workgroup_group_segment_byte_size = 212
+; HSA: group_segment_alignment = 4
+define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
+
+ %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
+
+ ret void
+}
+
+; (39 * 4) + (4 pad) + (7 * 8) = 216
+; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
+; HSA: workgroup_group_segment_byte_size = 216
+; HSA: group_segment_alignment = 4
+define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
+
+ %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
+
+ ret void
+}
+; Test how the size needed for padding changes based on when the
+; global is encountered during lowering. There should be a consistent
+; order to minimize padding waste.
+;
+; The way global addresses are lowered now, this is the inverse of
+; first use order, which isn't great.
+;
+; This should be the optimal order for these globals. If sorted to
+; minimize padding, the minimum possible size is: align 32, align 8,
+; align 16
+
+
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
+; HSA-LABEL: {{^}}test_round_size_3_order0:
+; HSA: workgroup_group_segment_byte_size = 134
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 32, 8, 16
+; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
+; HSA-LABEL: {{^}}test_round_size_3_order1:
+; HSA: workgroup_group_segment_byte_size = 134
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 16, 32, 8
+; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
+; HSA-LABEL: {{^}}test_round_size_3_order2:
+; HSA: workgroup_group_segment_byte_size = 150
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 16, 8, 32
+; 38 + (2 pad) + 38 + (2 pad) + 38
+; HSA-LABEL: {{^}}test_round_size_3_order3:
+; HSA: workgroup_group_segment_byte_size = 118
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 8, 32, 16
+; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
+; HSA-LABEL: {{^}}test_round_size_3_order4:
+; HSA: workgroup_group_segment_byte_size = 142
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 8, 16, 32
+; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
+; HSA-LABEL: {{^}}test_round_size_3_order5:
+; HSA: workgroup_group_segment_byte_size = 126
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind }
+attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll
index bf8df63be9fd..9875814b03d3 100644
--- a/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported initializer for address space in load_init_lds_global
+; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
new file mode 100644
index 000000000000..0c734c6d99dd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure that m0 is not reinitialized in the loop.
+
+; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
+; GCN: s_cbranch_scc1 BB0_3
+
+; Initialize in preheader
+; GCN: s_mov_b32 m0, -1
+
+; GCN: BB0_2:
+; GCN: ds_read_b32
+; GCN: buffer_store_dword
+
+; GCN: s_cbranch_vccz BB0_2
+
+; GCN: BB0_3:
+; GCN-NEXT: s_endpgm
+define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %i.01 = phi i32 [ %tmp4, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %i.01
+ %tmp2 = load i32, i32 addrspace(3)* %tmp1, align 4
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %indvars.iv
+ store i32 %tmp2, i32 addrspace(1)* %tmp3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %tmp4 = add nuw nsw i32 %i.01, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll
index 44ffc36af149..abe472e423fc 100644
--- a/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
;
; This test checks that the lds input queue will be empty at the end of
; the ALU clause.
@@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
%1 = load i32, i32 addrspace(3)* %0
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.r600.group.barrier()
; This will start a new clause for the vertex fetch
%2 = load i32, i32 addrspace(1)* %in
@@ -23,7 +23,7 @@ entry:
ret void
}
-declare void @llvm.AMDGPU.barrier.local()
+declare void @llvm.r600.group.barrier() nounwind convergent
; The machine scheduler does not do proper alias analysis and assumes that
; loads from global values (Note that a global value is different than a
diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll
index 3e8328659fdb..1607713090e3 100644
--- a/test/CodeGen/AMDGPU/lds-size.ll
+++ b/test/CodeGen/AMDGPU/lds-size.ll
@@ -1,11 +1,17 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
; This test makes sure we do not double count global values when they are
; used in different basic blocks.
-; CHECK: .long 166120
-; CHECK-NEXT: .long 1
-; CHECK-LABEL: {{^}}test:
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; EG: .long 166120
+; EG-NEXT: .long 1
+; ALL: {{^}}test:
+
+; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
define void @test(i32 addrspace(1)* %out, i32 %cond) {
diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index fb51bc0e50c2..cb5d73fb0d8b 100644
--- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
+; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
@lds = addrspace(3) global [256 x i32] zeroinitializer
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index 9d2320cb2d19..82fbb7f46186 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -54,11 +54,11 @@ entry:
; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
define void @inline_literal_dot4(float addrspace(1)* %out) {
entry:
- %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store float %0, float addrspace(1)* %out
ret void
}
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/liveness.mir b/test/CodeGen/AMDGPU/liveness.mir
new file mode 100644
index 000000000000..ce49294d5b36
--- /dev/null
+++ b/test/CodeGen/AMDGPU/liveness.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -run-pass liveintervals -verify-machineinstrs -o /dev/null -debug-only=regalloc %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+# We currently maintain a main liveness range which operates like a superset of
+# all subregister liveranges. We may need to create additional SSA values at
+# merge points in this main liverange even though none of the subregister
+# liveranges needed it.
+#
+# Should see three distinct value numbers:
+# CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
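+#
+# (Assumed mapping, for illustration: value 0 is the def of sub0 in bb.0,
+# value 1 is the def of sub1 in bb.1, and value 2 is the phi created where
+# the two paths merge in bb.2.)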
+--- |
+ define void @test0() { ret void }
+...
+---
+name: test0
+registers:
+ - { id: 0, class: sreg_64 }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_NOP 0, implicit-def undef %0:sub0
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %0:sub0
+...
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
deleted file mode 100644
index ca8ddbae9fbc..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
-
-; Legacy name
-declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}s_abs_i32:
-; SI: s_abs_i32
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
- %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_abs_i32:
-; SI: v_sub_i32_e32
-; SI: v_max_i32_e32
-; SI: s_endpgm
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
- %val = load i32, i32 addrspace(1)* %src, align 4
- %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}abs_i32_legacy_amdil:
-; SI: v_sub_i32_e32
-; SI: v_max_i32_e32
-; SI: s_endpgm
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
- %val = load i32, i32 addrspace(1)* %src, align 4
- %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
deleted file mode 100644
index db883972d646..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_global:
-; EG: GROUP_BARRIER
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_global(i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.tidig.x()
- %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
- store i32 %0, i32 addrspace(1)* %1
- call void @llvm.AMDGPU.barrier.global()
- %2 = call i32 @llvm.r600.read.local.size.x()
- %3 = sub i32 %2, 1
- %4 = sub i32 %3, %0
- %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
- %6 = load i32, i32 addrspace(1)* %5
- store i32 %6, i32 addrspace(1)* %1
- ret void
-}
-
-declare void @llvm.AMDGPU.barrier.global()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
deleted file mode 100644
index 48fb2e0b1a8d..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_local:
-; EG: GROUP_BARRIER
-
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_local(i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.tidig.x()
- %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
- store i32 %0, i32 addrspace(1)* %1
- call void @llvm.AMDGPU.barrier.local()
- %2 = call i32 @llvm.r600.read.local.size.x()
- %3 = sub i32 %2, 1
- %4 = sub i32 %3, %0
- %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
- %6 = load i32, i32 addrspace(1)* %5
- store i32 %6, i32 addrspace(1)* %1
- ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
deleted file mode 100644
index 517a55abc098..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfi_arg_arg_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_arg_arg_imm:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_arg_imm_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_imm_arg_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
deleted file mode 100644
index 50492289d744..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfm_arg_arg:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; EG: BFM_INT
-define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_arg_imm:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b
-; EG: BFM_INT
-define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_imm_arg:
-; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}}
-; EG: BFM_INT
-define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_imm_imm:
-; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8
-; EG: BFM_INT
-define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_pattern:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) {
- %a = shl i32 1, %x
- %b = sub i32 %a, 1
- %c = shl i32 %b, %y
- store i32 %c, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_pattern_simple:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) {
- %a = shl i32 1, %x
- %b = sub i32 %a, 1
- store i32 %b, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
index 11ec963ab314..cfe4cc00ee81 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -1,10 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.fabs.f32(float) nounwind readnone
declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
-declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}clamp_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
@@ -55,13 +54,3 @@ define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounw
store float %clamp, float addrspace(1)* %out, align 4
ret void
}
-
-; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
- %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
index e95a51093cb7..78b88122229b 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
@@ -1,59 +1,57 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK: {{^}}cube:
+; CHECK-LABEL: {{^}}cube:
; CHECK: CUBE T{{[0-9]}}.X
; CHECK: CUBE T{{[0-9]}}.Y
; CHECK: CUBE T{{[0-9]}}.Z
; CHECK: CUBE * T{{[0-9]}}.W
-define void @cube() #0 {
+define amdgpu_ps void @cube() {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %1 = extractelement <4 x float> %0, i32 3
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %3 = extractelement <4 x float> %2, i32 0
- %4 = fdiv float %3, %1
- %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %6 = extractelement <4 x float> %5, i32 1
- %7 = fdiv float %6, %1
- %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %9 = extractelement <4 x float> %8, i32 2
- %10 = fdiv float %9, %1
- %11 = insertelement <4 x float> undef, float %4, i32 0
- %12 = insertelement <4 x float> %11, float %7, i32 1
- %13 = insertelement <4 x float> %12, float %10, i32 2
- %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
- %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
- %16 = extractelement <4 x float> %15, i32 0
- %17 = extractelement <4 x float> %15, i32 1
- %18 = extractelement <4 x float> %15, i32 2
- %19 = extractelement <4 x float> %15, i32 3
- %20 = call float @fabs(float %18)
- %21 = fdiv float 1.000000e+00, %20
- %22 = fmul float %16, %21
- %23 = fadd float %22, 1.500000e+00
- %24 = fmul float %17, %21
- %25 = fadd float %24, 1.500000e+00
- %26 = insertelement <4 x float> undef, float %25, i32 0
- %27 = insertelement <4 x float> %26, float %23, i32 1
- %28 = insertelement <4 x float> %27, float %19, i32 2
- %29 = insertelement <4 x float> %28, float %25, i32 3
- %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
- call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp1 = extractelement <4 x float> %tmp, i32 3
+ %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp3 = extractelement <4 x float> %tmp2, i32 0
+ %tmp4 = fdiv float %tmp3, %tmp1
+ %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp6 = extractelement <4 x float> %tmp5, i32 1
+ %tmp7 = fdiv float %tmp6, %tmp1
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp9 = extractelement <4 x float> %tmp8, i32 2
+ %tmp10 = fdiv float %tmp9, %tmp1
+ %tmp11 = insertelement <4 x float> undef, float %tmp4, i32 0
+ %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
+ %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
+ %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
+ %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
+ %tmp16 = extractelement <4 x float> %tmp15, i32 0
+ %tmp17 = extractelement <4 x float> %tmp15, i32 1
+ %tmp18 = extractelement <4 x float> %tmp15, i32 2
+ %tmp19 = extractelement <4 x float> %tmp15, i32 3
+ %tmp20 = call float @llvm.fabs.f32(float %tmp18)
+ %tmp21 = fdiv float 1.000000e+00, %tmp20
+ %tmp22 = fmul float %tmp16, %tmp21
+ %tmp23 = fadd float %tmp22, 1.500000e+00
+ %tmp24 = fmul float %tmp17, %tmp21
+ %tmp25 = fadd float %tmp24, 1.500000e+00
+ %tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0
+ %tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1
+ %tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2
+ %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3
+ %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0)
ret void
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
-; Function Attrs: readnone
-declare float @fabs(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
deleted file mode 100644
index 8b32f696449e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
-
-; SI-LABEL: {{^}}test_unpack_byte0_to_float:
-; SI: v_cvt_f32_ubyte0
-define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte1_to_float:
-; SI: v_cvt_f32_ubyte1
-define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte2_to_float:
-; SI: v_cvt_f32_ubyte2
-define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte3_to_float:
-; SI: v_cvt_f32_ubyte3
-define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
deleted file mode 100644
index 6049dca04012..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-
-declare double @llvm.fabs.f64(double %Val)
-declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}fract_f64:
-; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f64_neg:
-; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %neg = fsub double 0.0, %val
- %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f64_neg_abs:
-; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %abs = call double @llvm.fabs.f64(double %val)
- %neg = fsub double 0.0, %abs
- %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
deleted file mode 100644
index 7501b4b75465..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.fabs.f32(float %Val)
-declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
-
-; Legacy name
-declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}fract_f32:
-; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
-; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_neg:
-; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
-; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %neg = fsub float 0.0, %val
- %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_neg_abs:
-; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
-; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
-; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %abs = call float @llvm.fabs.f32(float %val)
- %neg = fsub float 0.0, %abs
- %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
deleted file mode 100644
index 42102e30f071..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-; FIXME: Store of i32 seems to be broken pre-EG somehow?
-
-declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_imad24:
-; SI: v_mad_i32_i24
-; CM: MULADD_INT24
-; R600: MULLO_INT
-; R600: ADD_INT
-define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
deleted file mode 100644
index 46662f96c290..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_imax:
-; SI: v_max_i32_e32
-define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_imax:
-; SI: s_max_i32
-define void @scalar_imax(i32 %p0, i32 %p1) #0 {
-entry:
- %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.imax(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
deleted file mode 100644
index 34b454e23755..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_imin:
-; SI: v_min_i32_e32
-define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_imin:
-; SI: s_min_i32
-define void @scalar_imin(i32 %p0, i32 %p1) #0 {
-entry:
- %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.imin(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
deleted file mode 100644
index fdc1172260b9..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_imul24:
-; SI: v_mul_i32_i24
-; CM: MUL_INT24
-; R600: MULLO_INT
-define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
- store i32 %mul, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
index 057708e7b5cc..59997d27683d 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -5,7 +5,7 @@
; SI-NOT: v_cmpx_le_f32
; SI: s_mov_b64 exec, 0
-define void @kill_gs_const() #0 {
+define amdgpu_gs void @kill_gs_const() {
main_body:
%0 = icmp ule i32 0, 3
%1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
@@ -21,7 +21,7 @@ main_body:
; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 {
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
entry:
%tmp0 = fcmp olt float %13, 0.0
call void @llvm.AMDGPU.kill(float %14)
@@ -33,7 +33,4 @@ entry:
declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="2" }
-attributes #1 = { "ShaderType"="0" }
-
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
deleted file mode 100644
index a59c0ce6d675..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
-declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
-
-; SI-LABEL: {{^}}test_ldexp_f32:
-; SI: v_ldexp_f32
-; SI: s_endpgm
-define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
- %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_ldexp_f64:
-; SI: v_ldexp_f64
-; SI: s_endpgm
-define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
- %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
deleted file mode 100644
index 4cafd563685e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_legacy_f32:
-; SI: v_rsq_legacy_f32_e32
-; EG: RECIPSQRT_IEEE
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
deleted file mode 100644
index 83b56a5029d3..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = extractelement <4 x float> %reg0, i32 1
- %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
- %vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare float @llvm.AMDGPU.mul(float ,float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
deleted file mode 100644
index d2a655bf909c..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
-declare double @llvm.sqrt.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}rcp_f64:
-; SI: v_rcp_f64_e32
-define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
- %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}rcp_pat_f64:
-; SI: v_rcp_f64_e32
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
- %rcp = fdiv double 1.0, %src
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
-; SI-UNSAFE: v_rsq_f64_e32
-; SI-SAFE-NOT: v_rsq_f64_e32
-; SI-SAFE: v_sqrt_f64
-; SI-SAFE: v_rcp_f64
-define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
- %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
- %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
deleted file mode 100644
index edd6e9a72f1b..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
-
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
-declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
-
-declare float @llvm.sqrt.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rcp_f32:
-; SI: v_rcp_f32_e32
-; EG: RECIP_IEEE
-define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
- %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FIXME: Evergreen only ever does unsafe fp math.
-; FUNC-LABEL: {{^}}rcp_pat_f32:
-
-; SI-SAFE: v_rcp_f32_e32
-; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32
-
-; EG: RECIP_IEEE
-
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
- %rcp = fdiv float 1.0, %src
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_rcp_pat_f32:
-; SI-UNSAFE: v_rsq_f32_e32
-; SI-SAFE: v_sqrt_f32_e32
-; SI-SAFE: v_rcp_f32_e32
-
-; EG: RECIPSQRT_IEEE
-define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
- %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
- %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
deleted file mode 100644
index 67f1d22c7178..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
-
-declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_clamped_f64:
-; SI: v_rsq_clamp_f64_e32
-
-; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
-; TODO: this constant should be folded:
-; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
-; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
-; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
-; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
-
-define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
- %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
- store double %rsq_clamped, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
deleted file mode 100644
index eeff2536b232..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-
-declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_clamped_f32:
-; SI: v_rsq_clamp_f32_e32
-
-; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}}
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
-; TODO: this constant should be folded:
-; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff
-; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]]
-
-; EG: RECIPSQRT_CLAMPED
-
-define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
- store float %rsq_clamped, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
deleted file mode 100644
index 36b72f14db19..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_f32:
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
-
-; TODO: Really these should be constant folded
-; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
deleted file mode 100644
index 10206609bb57..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
- %addr = load <4 x float>, <4 x float> addrspace(1)* %in
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8)
- %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9)
- %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10)
- %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11)
- %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12)
- %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13)
- %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14)
- %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15)
- %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16)
- store <4 x float> %res16, <4 x float> addrspace(1)* %out
- ret void
-}
-
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
deleted file mode 100644
index a30a8e083eb6..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; R600: {{^}}amdgpu_trunc:
-; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI: {{^}}amdgpu_trunc:
-; SI: v_trunc_f32
-
-define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
-entry:
- %0 = call float @llvm.AMDGPU.trunc(float %x)
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.AMDGPU.trunc(float ) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
deleted file mode 100644
index 77a073b0cb03..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; FUNC-LABEL: {{^}}test_umad24:
-; SI: v_mad_u32_u24
-; EG: MULADD_UINT24
-; R600: MULLO_UINT
-; R600: ADD_INT
-define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}commute_umad24:
-; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
-; SI: buffer_store_dword [[RESULT]]
-define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1
-
- %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4
- %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4
- %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out.gep, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
deleted file mode 100644
index a97d103016d3..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_umax:
-; SI: v_max_u32_e32
-define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_umax:
-; SI: s_max_u32
-define void @scalar_umax(i32 %p0, i32 %p1) #0 {
-entry:
- %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_zext_umax:
-; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
-; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: and
-; SI: buffer_store_short [[RESULT]],
-define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
- %tmp5 = load i8, i8 addrspace(1)* %src, align 1
- %tmp2 = zext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
- %tmp4 = trunc i32 %tmp3 to i8
- %tmp6 = zext i8 %tmp4 to i16
- store i16 %tmp6, i16 addrspace(1)* %out, align 2
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.umax(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
deleted file mode 100644
index 2acd10e0c631..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_umin:
-; SI: v_min_u32_e32
-define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_umin:
-; SI: s_min_u32
-define void @scalar_umin(i32 %p0, i32 %p1) #0 {
-entry:
- %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_zext_umin:
-; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
-; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: and
-; SI: buffer_store_short [[RESULT]],
-define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
- %tmp5 = load i8, i8 addrspace(1)* %src, align 1
- %tmp2 = zext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
- %tmp4 = trunc i32 %tmp3 to i8
- %tmp6 = zext i8 %tmp4 to i16
- store i16 %tmp6, i16 addrspace(1)* %out, align 2
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.umin(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
deleted file mode 100644
index 76624a078b3a..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_umul24:
-; SI: v_mul_u32_u24
-; R600: MUL_UINT24
-; R600: MULLO_UINT
-define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone
- store i32 %mul, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
index fdc324087015..ca1faebb77e7 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
@@ -10,7 +10,7 @@
;GCN: v_interp_p1_f32
;GCN: v_interp_p2_f32
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
main_body:
%5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
%6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
@@ -25,7 +25,7 @@ main_body:
; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
main_body:
%22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
%23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
@@ -42,19 +42,18 @@ main_body:
}
; Function Attrs: readnone
-declare float @fabs(float) #2
+declare float @fabs(float) #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
+declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.constant(i32, i32, i32) #1
+declare float @llvm.SI.fs.constant(i32, i32, i32) #0
; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
index 275cb580bc9b..aef9f660436e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
@@ -2,10 +2,10 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}gather4_v2:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_v2() #0 {
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -15,10 +15,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4() #0 {
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -28,10 +28,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl:
-;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl() #0 {
+;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -41,10 +41,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l:
-;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l() #0 {
+;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -54,10 +54,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b:
-;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b() #0 {
+;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -67,10 +67,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl() #0 {
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -80,10 +80,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl_v8:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl_v8() #0 {
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -93,10 +93,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz_v2:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz_v2() #0 {
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -106,10 +106,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz() #0 {
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -121,10 +121,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_o:
-;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_o() #0 {
+;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -134,10 +134,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl_o:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl_o() #0 {
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -147,10 +147,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl_o_v8:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl_o_v8() #0 {
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -160,10 +160,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l_o:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l_o() #0 {
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -173,10 +173,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l_o_v8:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l_o_v8() #0 {
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -186,10 +186,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_o:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_o() #0 {
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -199,10 +199,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_o_v8:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_o_v8() #0 {
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -212,10 +212,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl_o:
-;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl_o() #0 {
+;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -225,10 +225,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz_o:
-;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz_o() #0 {
+;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -240,10 +240,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_c:
-;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c() #0 {
+;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -253,10 +253,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl() #0 {
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -266,10 +266,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl_v8:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl_v8() #0 {
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -279,10 +279,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l() #0 {
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -292,10 +292,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l_v8:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l_v8() #0 {
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -305,10 +305,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b() #0 {
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -318,10 +318,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_v8:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_v8() #0 {
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -331,10 +331,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_cl:
-;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_cl() #0 {
+;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -344,10 +344,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz:
-;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz() #0 {
+;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -359,10 +359,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_c_o:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_o() #0 {
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -372,10 +372,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_o_v8:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_o_v8() #0 {
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -385,10 +385,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl_o:
-;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl_o() #0 {
+;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -398,10 +398,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l_o:
-;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l_o() #0 {
+;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -411,10 +411,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_o:
-;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_o() #0 {
+;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -424,10 +424,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
-;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_cl_o() #0 {
+;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -437,10 +437,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz_o:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz_o() #0 {
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -450,60 +450,76 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz_o_v8() #0 {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-
-declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz_o_v8() {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_sgpr_bug:
+;
+; This test case used to crash due to a bug in FixSGPRCopies. Derived from the
+; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
+;
+;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
+;CHECK: s_waitcnt lgkmcnt(0)
+;CHECK: s_mov_b32 s[[LO]], 0
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
+define amdgpu_ps float @gather4_sgpr_bug() {
+main_body:
+ %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16
+ %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
+ %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp4 = extractelement <4 x float> %tmp2, i32 1
+ %tmp9 = fadd float undef, %tmp4
+ ret float %tmp9
+}
+
+declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
index 06ee98e91b31..ac34d31b97c1 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
@@ -2,10 +2,10 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}getlod:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -13,10 +13,10 @@ main_body:
}
;CHECK-LABEL: {{^}}getlod_v2:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod_v2() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -24,10 +24,10 @@ main_body:
}
;CHECK-LABEL: {{^}}getlod_v4:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod_v4() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod_v4() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -35,11 +35,10 @@ main_body:
}
-declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll
index 0fac8d799562..50341e3e207f 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.ll
@@ -2,8 +2,8 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}image_load:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @image_load() #0 {
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @image_load() {
main_body:
%r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -15,8 +15,8 @@ main_body:
}
;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @image_load_mip() #0 {
+;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @image_load_mip() {
main_body:
%r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -28,8 +28,8 @@ main_body:
}
;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getresinfo() #0 {
+;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @getresinfo() {
main_body:
%r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -40,11 +40,10 @@ main_body:
ret void
}
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
index ce9558cbf81d..7cdd9559994e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
@@ -2,11 +2,11 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
; CHECK-LABEL: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
-define void @v1(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd
+define amdgpu_ps void @v1(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 2
%4 = extractelement <4 x float> %1, i32 3
@@ -15,11 +15,11 @@ entry:
}
; CHECK-LABEL: {{^}}v2:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11
-define void @v2(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb
+define amdgpu_ps void @v2(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 1
%4 = extractelement <4 x float> %1, i32 3
@@ -28,11 +28,11 @@ entry:
}
; CHECK-LABEL: {{^}}v3:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
-define void @v3(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe
+define amdgpu_ps void @v3(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 2
%4 = extractelement <4 x float> %1, i32 3
@@ -41,11 +41,11 @@ entry:
}
; CHECK-LABEL: {{^}}v4:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7
-define void @v4(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7
+define amdgpu_ps void @v4(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 1
%4 = extractelement <4 x float> %1, i32 2
@@ -54,11 +54,11 @@ entry:
}
; CHECK-LABEL: {{^}}v5:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
-define void @v5(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa
+define amdgpu_ps void @v5(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 3
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
@@ -66,11 +66,11 @@ entry:
}
; CHECK-LABEL: {{^}}v6:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6
-define void @v6(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6
+define amdgpu_ps void @v6(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 2
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
@@ -78,19 +78,17 @@ entry:
}
; CHECK-LABEL: {{^}}v7:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9
-define void @v7(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9
+define amdgpu_ps void @v7(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 3
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
ret void
}
-declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
index 4bc638a28063..60077dc218fd 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
@@ -3,8 +3,8 @@
;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm
-;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample() #0 {
+;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -17,8 +17,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm
-;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cl() #0 {
+;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -31,8 +31,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d() #0 {
+;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -45,8 +45,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d_cl() #0 {
+;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -59,8 +59,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_l() #0 {
+;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -73,8 +73,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm
-;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b() #0 {
+;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -87,8 +87,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b_cl() #0 {
+;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -101,8 +101,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_lz() #0 {
+;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -115,8 +115,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd() #0 {
+;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -129,8 +129,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd_cl() #0 {
+;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -143,8 +143,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm
-;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c() #0 {
+;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -157,8 +157,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cl() #0 {
+;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -171,8 +171,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d() #0 {
+;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -185,8 +185,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d_cl() #0 {
+;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -199,8 +199,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_l() #0 {
+;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -213,8 +213,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm
-;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b() #0 {
+;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -227,8 +227,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b_cl() #0 {
+;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -241,8 +241,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_lz() #0 {
+;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -255,8 +255,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd() #0 {
+;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -269,8 +269,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd_cl() #0 {
+;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -282,29 +282,28 @@ main_body:
}
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
index 9d8935414ed9..34d4f6825690 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
@@ -3,8 +3,8 @@
;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm
-;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample() #0 {
+;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -17,8 +17,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm
-;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cl() #0 {
+;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -31,8 +31,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d() #0 {
+;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -45,8 +45,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d_cl() #0 {
+;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -59,8 +59,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_l() #0 {
+;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -73,8 +73,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm
-;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b() #0 {
+;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -87,8 +87,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b_cl() #0 {
+;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -101,8 +101,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_lz() #0 {
+;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -115,8 +115,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd() #0 {
+;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -129,8 +129,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd_cl() #0 {
+;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -143,8 +143,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm
-;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c() #0 {
+;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -157,8 +157,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cl() #0 {
+;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -171,8 +171,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d() #0 {
+;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -185,8 +185,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d_cl() #0 {
+;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -199,8 +199,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_l() #0 {
+;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -213,8 +213,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b() #0 {
+;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -227,8 +227,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b_cl() #0 {
+;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -241,8 +241,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_lz() #0 {
+;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -255,8 +255,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd() #0 {
+;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -269,8 +269,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd_cl() #0 {
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -282,29 +282,28 @@ main_body:
}
-declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
deleted file mode 100644
index b67716c3b665..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1,
- <32 x i8> undef, i32 1)
- %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2,
- <32 x i8> undef, i32 2)
- %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3,
- <32 x i8> undef, i32 3)
- %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4,
- <32 x i8> undef, i32 4)
- %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5,
- <32 x i8> undef, i32 5)
- %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6,
- <32 x i8> undef, i32 6)
- %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10,
- <32 x i8> undef, i32 10)
- %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11,
- <32 x i8> undef, i32 11)
- %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15,
- <32 x i8> undef, i32 15)
- %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16,
- <32 x i8> undef, i32 16)
- %e1 = extractelement <4 x i32> %res1, i32 0
- %e2 = extractelement <4 x i32> %res2, i32 1
- %e3 = extractelement <4 x i32> %res3, i32 2
- %e4 = extractelement <4 x i32> %res4, i32 3
- %t0 = extractelement <4 x i32> %res5, i32 0
- %t1 = extractelement <4 x i32> %res5, i32 1
- %e5 = add i32 %t0, %t1
- %t2 = extractelement <4 x i32> %res6, i32 0
- %t3 = extractelement <4 x i32> %res6, i32 2
- %e6 = add i32 %t2, %t3
- %t10 = extractelement <4 x i32> %res10, i32 2
- %t11 = extractelement <4 x i32> %res10, i32 3
- %e10 = add i32 %t10, %t11
- %t12 = extractelement <4 x i32> %res11, i32 0
- %t13 = extractelement <4 x i32> %res11, i32 1
- %t14 = extractelement <4 x i32> %res11, i32 2
- %t15 = add i32 %t12, %t13
- %e11 = add i32 %t14, %t15
- %t28 = extractelement <4 x i32> %res15, i32 0
- %t29 = extractelement <4 x i32> %res15, i32 1
- %t30 = extractelement <4 x i32> %res15, i32 2
- %t31 = extractelement <4 x i32> %res15, i32 3
- %t32 = add i32 %t28, %t29
- %t33 = add i32 %t30, %t31
- %e15 = add i32 %t32, %t33
- %e16 = extractelement <4 x i32> %res16, i32 3
- %s1 = add i32 %e1, %e2
- %s2 = add i32 %s1, %e3
- %s3 = add i32 %s2, %e4
- %s4 = add i32 %s3, %e5
- %s5 = add i32 %s4, %e6
- %s9 = add i32 %s5, %e10
- %s10 = add i32 %s9, %e11
- %s14 = add i32 %s10, %e15
- %s15 = add i32 %s14, %e16
- %s16 = bitcast i32 %s15 to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
- ret void
-}
-
-; Test that coordinates are stored in vgprs and not sgprs
-; CHECK: vgpr_coords
-; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
-define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-main_body:
- %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0
- %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2
- %22 = getelementptr float, float addrspace(2)* %21, i32 0
- %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1
- %24 = getelementptr float, float addrspace(2)* %21, i32 1
- %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1
- %26 = getelementptr float, float addrspace(2)* %21, i32 4
- %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1
- %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2
- %30 = bitcast float %27 to i32
- %31 = bitcast float %23 to i32
- %32 = bitcast float %25 to i32
- %33 = insertelement <4 x i32> undef, i32 %31, i32 0
- %34 = insertelement <4 x i32> %33, i32 %32, i32 1
- %35 = insertelement <4 x i32> %34, i32 %30, i32 2
- %36 = insertelement <4 x i32> %35, i32 undef, i32 3
- %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
- %38 = extractelement <4 x i32> %37, i32 0
- %39 = extractelement <4 x i32> %37, i32 1
- %40 = extractelement <4 x i32> %37, i32 2
- %41 = extractelement <4 x i32> %37, i32 3
- %42 = bitcast i32 %38 to float
- %43 = bitcast i32 %39 to float
- %44 = bitcast i32 %40 to float
- %45 = bitcast i32 %41 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
- ret void
-}
-
-declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null}
-!1 = !{}
-!2 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
index f6c258539d5b..d0cc00d81b4e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@@ -7,14 +7,14 @@
; FIXME: Out of bounds immediate offset crashes
; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
-define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 {
+define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
main_body:
%tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1
%tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -40,14 +40,13 @@ main_body:
}
; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1
+declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { nounwind readonly }
+attributes #0 = { nounwind readonly }
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
index 0155757632d4..6984b4cf488a 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
@@ -6,7 +6,7 @@
; GCN: v_cvt_pkrtz_f16_f32
; GCN-NOT: v_cvt_pkrtz_f16_f32
-define void @main(float %src) #0 {
+define amdgpu_ps void @main(float %src) {
main_body:
%p1 = call i32 @llvm.SI.packf16(float undef, float %src)
%p2 = call i32 @llvm.SI.packf16(float %src, float undef)
@@ -21,9 +21,8 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
+declare i32 @llvm.SI.packf16(float, float) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
deleted file mode 100644
index ac95fd0b83a2..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
+++ /dev/null
@@ -1,111 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
- i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
- %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1)
- %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2)
- %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3)
- %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4)
- %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5)
- %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6)
- %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7)
- %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8)
- %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9)
- %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10)
- %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11)
- %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12)
- %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13)
- %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14)
- %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15)
- %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16)
- %e1 = extractelement <4 x i32> %res1, i32 0
- %e2 = extractelement <4 x i32> %res2, i32 1
- %e3 = extractelement <4 x i32> %res3, i32 2
- %e4 = extractelement <4 x i32> %res4, i32 3
- %t0 = extractelement <4 x i32> %res5, i32 0
- %t1 = extractelement <4 x i32> %res5, i32 1
- %e5 = add i32 %t0, %t1
- %t2 = extractelement <4 x i32> %res6, i32 0
- %t3 = extractelement <4 x i32> %res6, i32 2
- %e6 = add i32 %t2, %t3
- %t4 = extractelement <4 x i32> %res7, i32 0
- %t5 = extractelement <4 x i32> %res7, i32 3
- %e7 = add i32 %t4, %t5
- %t6 = extractelement <4 x i32> %res8, i32 1
- %t7 = extractelement <4 x i32> %res8, i32 2
- %e8 = add i32 %t6, %t7
- %t8 = extractelement <4 x i32> %res9, i32 1
- %t9 = extractelement <4 x i32> %res9, i32 3
- %e9 = add i32 %t8, %t9
- %t10 = extractelement <4 x i32> %res10, i32 2
- %t11 = extractelement <4 x i32> %res10, i32 3
- %e10 = add i32 %t10, %t11
- %t12 = extractelement <4 x i32> %res11, i32 0
- %t13 = extractelement <4 x i32> %res11, i32 1
- %t14 = extractelement <4 x i32> %res11, i32 2
- %t15 = add i32 %t12, %t13
- %e11 = add i32 %t14, %t15
- %t16 = extractelement <4 x i32> %res12, i32 0
- %t17 = extractelement <4 x i32> %res12, i32 1
- %t18 = extractelement <4 x i32> %res12, i32 3
- %t19 = add i32 %t16, %t17
- %e12 = add i32 %t18, %t19
- %t20 = extractelement <4 x i32> %res13, i32 0
- %t21 = extractelement <4 x i32> %res13, i32 2
- %t22 = extractelement <4 x i32> %res13, i32 3
- %t23 = add i32 %t20, %t21
- %e13 = add i32 %t22, %t23
- %t24 = extractelement <4 x i32> %res14, i32 1
- %t25 = extractelement <4 x i32> %res14, i32 2
- %t26 = extractelement <4 x i32> %res14, i32 3
- %t27 = add i32 %t24, %t25
- %e14 = add i32 %t26, %t27
- %t28 = extractelement <4 x i32> %res15, i32 0
- %t29 = extractelement <4 x i32> %res15, i32 1
- %t30 = extractelement <4 x i32> %res15, i32 2
- %t31 = extractelement <4 x i32> %res15, i32 3
- %t32 = add i32 %t28, %t29
- %t33 = add i32 %t30, %t31
- %e15 = add i32 %t32, %t33
- %e16 = extractelement <4 x i32> %res16, i32 3
- %s1 = add i32 %e1, %e2
- %s2 = add i32 %s1, %e3
- %s3 = add i32 %s2, %e4
- %s4 = add i32 %s3, %e5
- %s5 = add i32 %s4, %e6
- %s6 = add i32 %s5, %e7
- %s7 = add i32 %s6, %e8
- %s8 = add i32 %s7, %e9
- %s9 = add i32 %s8, %e10
- %s10 = add i32 %s9, %e11
- %s11 = add i32 %s10, %e12
- %s12 = add i32 %s11, %e13
- %s13 = add i32 %s12, %e14
- %s14 = add i32 %s13, %e15
- %s15 = add i32 %s14, %e16
- %s16 = bitcast i32 %s15 to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
- ret void
-}
-
-declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.sample.ll
deleted file mode 100644
index 509c45f588b8..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sample.ll
+++ /dev/null
@@ -1,160 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: image_sample {{v[0-9]+}}, 2
-;CHECK-DAG: image_sample {{v[0-9]+}}, 1
-;CHECK-DAG: image_sample {{v[0-9]+}}, 4
-;CHECK-DAG: image_sample {{v[0-9]+}}, 8
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: image_sample {{v[0-9]+}}, 8
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
- %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
- %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
- %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
- %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
- <32 x i8> undef, <16 x i8> undef, i32 1)
- %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
- <32 x i8> undef, <16 x i8> undef, i32 2)
- %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
- <32 x i8> undef, <16 x i8> undef, i32 3)
- %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
- <32 x i8> undef, <16 x i8> undef, i32 4)
- %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
- <32 x i8> undef, <16 x i8> undef, i32 5)
- %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
- <32 x i8> undef, <16 x i8> undef, i32 6)
- %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
- <32 x i8> undef, <16 x i8> undef, i32 7)
- %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
- <32 x i8> undef, <16 x i8> undef, i32 8)
- %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
- <32 x i8> undef, <16 x i8> undef, i32 9)
- %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
- <32 x i8> undef, <16 x i8> undef, i32 10)
- %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
- <32 x i8> undef, <16 x i8> undef, i32 11)
- %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
- <32 x i8> undef, <16 x i8> undef, i32 12)
- %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
- <32 x i8> undef, <16 x i8> undef, i32 13)
- %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
- <32 x i8> undef, <16 x i8> undef, i32 14)
- %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
- <32 x i8> undef, <16 x i8> undef, i32 15)
- %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
- <32 x i8> undef, <16 x i8> undef, i32 16)
- %e1 = extractelement <4 x float> %res1, i32 0
- %e2 = extractelement <4 x float> %res2, i32 1
- %e3 = extractelement <4 x float> %res3, i32 2
- %e4 = extractelement <4 x float> %res4, i32 3
- %t0 = extractelement <4 x float> %res5, i32 0
- %t1 = extractelement <4 x float> %res5, i32 1
- %e5 = fadd float %t0, %t1
- %t2 = extractelement <4 x float> %res6, i32 0
- %t3 = extractelement <4 x float> %res6, i32 2
- %e6 = fadd float %t2, %t3
- %t4 = extractelement <4 x float> %res7, i32 0
- %t5 = extractelement <4 x float> %res7, i32 3
- %e7 = fadd float %t4, %t5
- %t6 = extractelement <4 x float> %res8, i32 1
- %t7 = extractelement <4 x float> %res8, i32 2
- %e8 = fadd float %t6, %t7
- %t8 = extractelement <4 x float> %res9, i32 1
- %t9 = extractelement <4 x float> %res9, i32 3
- %e9 = fadd float %t8, %t9
- %t10 = extractelement <4 x float> %res10, i32 2
- %t11 = extractelement <4 x float> %res10, i32 3
- %e10 = fadd float %t10, %t11
- %t12 = extractelement <4 x float> %res11, i32 0
- %t13 = extractelement <4 x float> %res11, i32 1
- %t14 = extractelement <4 x float> %res11, i32 2
- %t15 = fadd float %t12, %t13
- %e11 = fadd float %t14, %t15
- %t16 = extractelement <4 x float> %res12, i32 0
- %t17 = extractelement <4 x float> %res12, i32 1
- %t18 = extractelement <4 x float> %res12, i32 3
- %t19 = fadd float %t16, %t17
- %e12 = fadd float %t18, %t19
- %t20 = extractelement <4 x float> %res13, i32 0
- %t21 = extractelement <4 x float> %res13, i32 2
- %t22 = extractelement <4 x float> %res13, i32 3
- %t23 = fadd float %t20, %t21
- %e13 = fadd float %t22, %t23
- %t24 = extractelement <4 x float> %res14, i32 1
- %t25 = extractelement <4 x float> %res14, i32 2
- %t26 = extractelement <4 x float> %res14, i32 3
- %t27 = fadd float %t24, %t25
- %e14 = fadd float %t26, %t27
- %t28 = extractelement <4 x float> %res15, i32 0
- %t29 = extractelement <4 x float> %res15, i32 1
- %t30 = extractelement <4 x float> %res15, i32 2
- %t31 = extractelement <4 x float> %res15, i32 3
- %t32 = fadd float %t28, %t29
- %t33 = fadd float %t30, %t31
- %e15 = fadd float %t32, %t33
- %e16 = extractelement <4 x float> %res16, i32 3
- %s1 = fadd float %e1, %e2
- %s2 = fadd float %s1, %e3
- %s3 = fadd float %s2, %e4
- %s4 = fadd float %s3, %e5
- %s5 = fadd float %s4, %e6
- %s6 = fadd float %s5, %e7
- %s7 = fadd float %s6, %e8
- %s8 = fadd float %s7, %e9
- %s9 = fadd float %s8, %e10
- %s10 = fadd float %s9, %e11
- %s11 = fadd float %s10, %e12
- %s12 = fadd float %s11, %e13
- %s13 = fadd float %s12, %e14
- %s14 = fadd float %s13, %e15
- %s15 = fadd float %s14, %e16
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
- ret void
-}
-
-; CHECK: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
-define void @v1(i32 %a1) #0 {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 1
- %4 = extractelement <4 x float> %1, i32 2
- %5 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5)
- ret void
-}
-
-
-declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
deleted file mode 100644
index f2badff2a99c..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
+++ /dev/null
@@ -1,143 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
- %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
- %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
- %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
- %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
- <32 x i8> undef, <16 x i8> undef, i32 1)
- %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
- <32 x i8> undef, <16 x i8> undef, i32 2)
- %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
- <32 x i8> undef, <16 x i8> undef, i32 3)
- %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
- <32 x i8> undef, <16 x i8> undef, i32 4)
- %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
- <32 x i8> undef, <16 x i8> undef, i32 5)
- %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
- <32 x i8> undef, <16 x i8> undef, i32 6)
- %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
- <32 x i8> undef, <16 x i8> undef, i32 7)
- %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
- <32 x i8> undef, <16 x i8> undef, i32 8)
- %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
- <32 x i8> undef, <16 x i8> undef, i32 9)
- %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
- <32 x i8> undef, <16 x i8> undef, i32 10)
- %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
- <32 x i8> undef, <16 x i8> undef, i32 11)
- %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
- <32 x i8> undef, <16 x i8> undef, i32 12)
- %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
- <32 x i8> undef, <16 x i8> undef, i32 13)
- %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
- <32 x i8> undef, <16 x i8> undef, i32 14)
- %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
- <32 x i8> undef, <16 x i8> undef, i32 15)
- %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
- <32 x i8> undef, <16 x i8> undef, i32 16)
- %e1 = extractelement <4 x float> %res1, i32 0
- %e2 = extractelement <4 x float> %res2, i32 1
- %e3 = extractelement <4 x float> %res3, i32 2
- %e4 = extractelement <4 x float> %res4, i32 3
- %t0 = extractelement <4 x float> %res5, i32 0
- %t1 = extractelement <4 x float> %res5, i32 1
- %e5 = fadd float %t0, %t1
- %t2 = extractelement <4 x float> %res6, i32 0
- %t3 = extractelement <4 x float> %res6, i32 2
- %e6 = fadd float %t2, %t3
- %t4 = extractelement <4 x float> %res7, i32 0
- %t5 = extractelement <4 x float> %res7, i32 3
- %e7 = fadd float %t4, %t5
- %t6 = extractelement <4 x float> %res8, i32 1
- %t7 = extractelement <4 x float> %res8, i32 2
- %e8 = fadd float %t6, %t7
- %t8 = extractelement <4 x float> %res9, i32 1
- %t9 = extractelement <4 x float> %res9, i32 3
- %e9 = fadd float %t8, %t9
- %t10 = extractelement <4 x float> %res10, i32 2
- %t11 = extractelement <4 x float> %res10, i32 3
- %e10 = fadd float %t10, %t11
- %t12 = extractelement <4 x float> %res11, i32 0
- %t13 = extractelement <4 x float> %res11, i32 1
- %t14 = extractelement <4 x float> %res11, i32 2
- %t15 = fadd float %t12, %t13
- %e11 = fadd float %t14, %t15
- %t16 = extractelement <4 x float> %res12, i32 0
- %t17 = extractelement <4 x float> %res12, i32 1
- %t18 = extractelement <4 x float> %res12, i32 3
- %t19 = fadd float %t16, %t17
- %e12 = fadd float %t18, %t19
- %t20 = extractelement <4 x float> %res13, i32 0
- %t21 = extractelement <4 x float> %res13, i32 2
- %t22 = extractelement <4 x float> %res13, i32 3
- %t23 = fadd float %t20, %t21
- %e13 = fadd float %t22, %t23
- %t24 = extractelement <4 x float> %res14, i32 1
- %t25 = extractelement <4 x float> %res14, i32 2
- %t26 = extractelement <4 x float> %res14, i32 3
- %t27 = fadd float %t24, %t25
- %e14 = fadd float %t26, %t27
- %t28 = extractelement <4 x float> %res15, i32 0
- %t29 = extractelement <4 x float> %res15, i32 1
- %t30 = extractelement <4 x float> %res15, i32 2
- %t31 = extractelement <4 x float> %res15, i32 3
- %t32 = fadd float %t28, %t29
- %t33 = fadd float %t30, %t31
- %e15 = fadd float %t32, %t33
- %e16 = extractelement <4 x float> %res16, i32 3
- %s1 = fadd float %e1, %e2
- %s2 = fadd float %s1, %e3
- %s3 = fadd float %s2, %e4
- %s4 = fadd float %s3, %e5
- %s5 = fadd float %s4, %e6
- %s6 = fadd float %s5, %e7
- %s7 = fadd float %s6, %e8
- %s8 = fadd float %s7, %e9
- %s9 = fadd float %s8, %e10
- %s10 = fadd float %s9, %e11
- %s11 = fadd float %s10, %e12
- %s12 = fadd float %s11, %e13
- %s13 = fadd float %s12, %e14
- %s14 = fadd float %s13, %e15
- %s15 = fadd float %s14, %e16
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
index 2198590f2dfe..2d4987643a2b 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
@@ -1,20 +1,17 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-; BOTH-LABEL: {{^}}main:
-; BOTH: s_mov_b32 m0, s0
+; GCN-LABEL: {{^}}main:
+; GCN: s_mov_b32 m0, s0
; VI-NEXT: s_nop 0
-; BOTH-NEXT: s_sendmsg Gs_done(nop)
-; BOTH-NEXT: s_endpgm
+; GCN-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
-define void @main(i32 inreg %a) #0 {
-main_body:
+define amdgpu_gs void @main(i32 inreg %a) #0 {
call void @llvm.SI.sendmsg(i32 3, i32 %a)
ret void
}
-; Function Attrs: nounwind
-declare void @llvm.SI.sendmsg(i32, i32) #1
+declare void @llvm.SI.sendmsg(i32, i32) #0
-attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" }
-attributes #1 = { nounwind }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
index 09675d503355..c4bb27676e7d 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
@@ -4,10 +4,10 @@
; CHECK-LABEL: {{^}}main:
; CHECK: s_mov_b32 m0, 0
; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg Gs(emit stream 0)
-; CHECK: s_sendmsg Gs(cut stream 1)
-; CHECK: s_sendmsg Gs(emit-cut stream 2)
-; CHECK: s_sendmsg Gs_done(nop)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
define void @main() {
main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
index 71f51548a5f8..645c6a6b8d7e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -3,7 +3,7 @@
;CHECK-LABEL: {{^}}test1:
;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test1(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
@@ -13,7 +13,7 @@ define void @test1(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test2:
;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test2(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
@@ -23,7 +23,7 @@ define void @test2(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test3:
;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test3(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
%vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
@@ -33,7 +33,7 @@ define void @test3(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test4:
;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test4(i32 %vdata, i32 %vaddr) #0 {
+define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
i32 1, i32 0)
@@ -43,5 +43,3 @@ define void @test4(i32 %vdata, i32 %vaddr) #0 {
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/test/CodeGen/AMDGPU/llvm.SI.tid.ll
deleted file mode 100644
index f6e6d7050ba7..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.tid.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
-
-;GCN: v_mbcnt_lo_u32_b32_e64
-;SI: v_mbcnt_hi_u32_b32_e32
-;VI: v_mbcnt_hi_u32_b32_e64
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
-main_body:
- %4 = call i32 @llvm.SI.tid()
- %5 = bitcast i32 %4 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
- ret void
-}
-
-declare i32 @llvm.SI.tid() readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
new file mode 100644
index 000000000000..93911d4a91f1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -0,0 +1,387 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32:
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
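+; For the i64 variants the 64-bit operand 42 is expected to be materialized as
+; a lo/hi VGPR pair (42 in the low half, 0 in the high half), matched below as
+; [[KLO]]/[[KHI]].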
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+@lds0 = addrspace(3) global [512 x i32] undef
+
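+; The address of the @lds0 element should lower to a shift of the thread id by
+; 2 (4-byte elements), with the constant +2 element offset (8 bytes) folded
+; into the DS offset field, as checked below.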
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8
+define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i64 %val0, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
new file mode 100644
index 000000000000..181d68c8ea75
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -0,0 +1,383 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 42
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
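+; Flat (addrspace(4)) accesses have no immediate offset field on these targets,
+; so the _offset variants below expect the same flat_atomic_inc form, with the
+; GEP applied to the address instead.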
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i64 %val0, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
new file mode 100644
index 000000000000..98f7058b5ef8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -0,0 +1,126 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
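+; test1 cycles a value through buffer_atomic_swap using each addressing mode
+; (no offsets, idxen, offen, both, an immediate offset, and an offset large
+; enough to need an soffset register on VI). The final swap discards its
+; result, so it is expected to be selected without glc.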
+;CHECK-LABEL: {{^}}test1:
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
+;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
+define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
+main_body:
+ %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
+ %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
+ %ofs.5 = add i32 %voffset, 42
+ %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
+ %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
+ %unused = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %out = bitcast i32 %o6 to float
+ ret float %out
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
+define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
+main_body:
+ %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t7 = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t8 = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %out = bitcast i32 %t9 to float
+ ret float %out
+}
+
+; Ideally, we would teach tablegen & friends that cmpswap only modifies the
+; first vgpr. Since we don't do that yet, the register allocator will have to
+; create copies which we don't bother to track here.
+;
+;CHECK-LABEL: {{^}}test3:
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
+;CHECK: s_waitcnt vmcnt(0)
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
+;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
+define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
+main_body:
+ %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
+ %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
+ %ofs.5 = add i32 %voffset, 42
+ %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
+ %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
+
+; Detecting the no-return variant doesn't work right now because of how the
+; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
+; Since there probably isn't a reasonable use-case of cmpswap that discards
+; the return value, that seems okay.
+;
+; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %out = bitcast i32 %o6 to float
+ ret float %out
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: buffer_atomic_add v0,
+define amdgpu_ps float @test4() {
+main_body:
+ %v = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
+ %v.float = bitcast i32 %v to float
+ ret float %v.float
+}
+
+declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
new file mode 100644
index 000000000000..67c7baba3e14
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -0,0 +1,133 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+ %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+ %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+ ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
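+; Offsets that do not fit the 12-bit MUBUF immediate are handled differently:
+; SI/CI are expected to move the offset into a VGPR and use offen, while VI can
+; split it into an SGPR soffset plus an immediate of at most 4095, as below.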
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
+;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
+;VI: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+ %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
+ %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
+ %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
+ %d.3 = fadd <4 x float> %d.0, %d.1
+ %data = fadd <4 x float> %d.2, %d.3
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
+;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
+;VI-NOT: s_mov
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
+;VI: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
+main_body:
+ %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
+ %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
+ %data = fadd <4 x float> %d.0, %d.1
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+ %ofs = add i32 %1, 58
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x:
+;CHECK: buffer_load_format_x v0, off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+ ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_xy:
+;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+ ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
new file mode 100644
index 000000000000..010ad276da10
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -0,0 +1,119 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+ %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+ %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+ ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
+;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
+;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+ %ofs = add i32 %1, 58
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+ %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+ ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+ %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+ ret <2 x float> %data
+}
+
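+; A negative offset cannot be encoded in the unsigned immediate field, so it is
+; expected to be applied with a v_add and the offen addressing mode.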
+;CHECK-LABEL: {{^}}buffer_load_negative_offset:
+;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
+;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+main_body:
+ %ofs.1 = add i32 %ofs, -16
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
new file mode 100644
index 000000000000..555a1d23ebe9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
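+; The two trailing i1 arguments select the glc and slc bits respectively; the
+; three stores below cover the plain, glc and slc forms.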
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
new file mode 100644
index 000000000000..5ae255c7a26c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 80eb3b93f8e5..668c669e41e8 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
-declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i1 @llvm.amdgcn.class.f32(float, i32) #1
+declare i1 @llvm.amdgcn.class.f64(double, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
@@ -15,7 +15,7 @@ declare double @llvm.fabs.f64(double) #1
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -31,7 +31,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI: s_endpgm
define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -47,7 +47,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI: s_endpgm
define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fneg = fsub float -0.0, %a
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -64,7 +64,7 @@ define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -77,7 +77,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b)
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -90,7 +90,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -105,7 +105,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -119,7 +119,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -133,12 +133,12 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -151,12 +151,12 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -171,12 +171,12 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -187,11 +187,11 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -202,12 +202,12 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -218,12 +218,12 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fneg = fsub double -0.0, %a
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -234,13 +234,13 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -250,7 +250,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b)
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -260,7 +260,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -276,7 +276,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -291,12 +291,12 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load double, double addrspace(1)* %in
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -307,12 +307,12 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -322,12 +322,12 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -339,13 +339,13 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -359,14 +359,14 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
- %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
@@ -382,21 +382,21 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
- %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
- %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
- %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
- %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
- %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
- %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
- %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class3 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
+ %class4 = call i1 @llvm.amdgcn.class.f32(float %a, i32 16) #1
+ %class5 = call i1 @llvm.amdgcn.class.f32(float %a, i32 32) #1
+ %class6 = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
+ %class7 = call i1 @llvm.amdgcn.class.f32(float %a, i32 128) #1
+ %class8 = call i1 @llvm.amdgcn.class.f32(float %a, i32 256) #1
+ %class9 = call i1 @llvm.amdgcn.class.f32(float %a, i32 512) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
%or.2 = or i1 %or.1, %class3
@@ -417,13 +417,13 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -437,13 +437,13 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -457,13 +457,13 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)
; SI: s_or_b64
; SI: s_endpgm
define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -477,7 +477,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -489,7 +489,19 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Why is the extension still here?
+; SI-LABEL: {{^}}test_class_undef_f32:
+; SI-NOT: v_cmp_class
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
+; SI: buffer_store_dword
+define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
new file mode 100644
index 000000000000..f6495d8155f7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.cos.f32(float) #0
+
+; GCN-LABEL: {{^}}v_cos_f32:
+; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+ %cos = call float @llvm.amdgcn.cos.f32(float %src) #0
+ store float %cos, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
new file mode 100644
index 000000000000..22bed45ee30f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubeid(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubeid:
+; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
new file mode 100644
index 000000000000..565f22c5d5b6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubema(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubema:
+; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
new file mode 100644
index 000000000000..a3ba32745814
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubesc(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubesc:
+; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
new file mode 100644
index 000000000000..d3c0f2851ead
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubetc(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubetc:
+; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index d96ea743f6ed..2e8625256f13 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; ERROR: error: unsupported hsa intrinsic without hsa target in test
+; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index 55ca9c7536e5..f9b390eca0c2 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -1,8 +1,8 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
-declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
+declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
+declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
; GCN-LABEL: {{^}}test_div_fixup_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
@@ -17,7 +17,7 @@ declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readn
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -25,7 +25,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl
; GCN-LABEL: {{^}}test_div_fixup_f64:
; GCN: v_div_fixup_f64
define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
- %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
+ %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 7dc094ed1b4b..efea3eb707a1 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
; FIXME: Enable for VI.
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
-declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
+declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
; GCN-LABEL: {{^}}test_div_fmas_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
@@ -21,7 +21,7 @@ declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind re
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -35,7 +35,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -45,11 +45,11 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a,
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -63,7 +63,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a,
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -71,7 +71,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a,
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: v_div_fmas_f64
define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
- %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
+ %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
@@ -81,7 +81,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b,
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -90,7 +90,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f
; SI: s_mov_b64 vcc, 0
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -99,7 +99,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f
; SI: s_mov_b64 vcc, -1
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -115,21 +115,21 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm
define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
- %a = load float, float addrspace(1)* %gep.a
- %b = load float, float addrspace(1)* %gep.b
- %c = load float, float addrspace(1)* %gep.c
+ %a = load volatile float, float addrspace(1)* %gep.a
+ %b = load volatile float, float addrspace(1)* %gep.b
+ %c = load volatile float, float addrspace(1)* %gep.c
%cmp0 = icmp eq i32 %tid, 0
%cmp1 = icmp ne i32 %d, 0
%and = and i1 %cmp0, %cmp1
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4
ret void
}
@@ -146,13 +146,13 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
; SI: BB9_2:
; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_i32_e32 vcc, 0, v0
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword
; SI: s_endpgm
define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
@@ -172,7 +172,7 @@ bb:
exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index de830de039c7..38e4b8440d32 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,235 +1,235 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
-declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
-; SI-LABEL @test_div_scale_f32_1:
+; SI-LABEL: {{^}}test_div_scale_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_2:
+; SI-LABEL: {{^}}test_div_scale_f32_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: {{^}}test_div_scale_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, double addrspace(1)* %gep.0, align 8
+ %b = load volatile double, double addrspace(1)* %gep.1, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: {{^}}test_div_scale_f64_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, double addrspace(1)* %gep.0, align 8
+ %b = load volatile double, double addrspace(1)* %gep.1, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_num_1:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%b = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_num_2:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%b = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_den_1:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_den_2:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_num_1:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1:
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%b = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_num_2:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%b = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_den_1:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%a = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_den_2:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%a = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_all_scalar_1:
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
@@ -237,13 +237,13 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_all_scalar_2:
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
@@ -251,13 +251,13 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a,
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_all_scalar_1:
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
@@ -266,13 +266,13 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a,
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_all_scalar_2:
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
@@ -281,83 +281,83 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_inline_imm_num:
+; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_inline_imm_den:
+; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_fabs_num:
+; SI-LABEL: {{^}}test_div_scale_f32_fabs_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_fabs_den:
+; SI-LABEL: {{^}}test_div_scale_f32_fabs_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
new file mode 100644
index 000000000000..92d3fc8b107e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_bpermute:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
new file mode 100644
index 000000000000..6d9c94191535
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_permute:
+; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_permute_imm_offset:
+; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
new file mode 100644
index 000000000000..ef3cb00024bb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_swizzle:
+; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+ %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+ store i32 %swizzle, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
new file mode 100644
index 000000000000..1cca9eb6a77a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.fract.f32(float) #0
+declare double @llvm.amdgcn.fract.f64(double) #0
+
+; GCN-LABEL: {{^}}v_fract_f32:
+; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+ %fract = call float @llvm.amdgcn.fract.f32(float %src)
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fract_f64:
+; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+ %fract = call double @llvm.amdgcn.fract.f64(double %src)
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fract_undef_f32:
+; GCN-NOT: v_fract_f32
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+ %fract = call float @llvm.amdgcn.fract.f32(float undef)
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
new file mode 100644
index 000000000000..728a6b5cf26b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
+declare i32 @llvm.amdgcn.frexp.exp.f32(float) #0
+declare i32 @llvm.amdgcn.frexp.exp.f64(double) #0
+
+; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
+define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
+define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %fneg.fabs.src = fsub float -0.0, %fabs.src
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %fneg.fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %fneg.fabs.src = fsub double -0.0, %fabs.src
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %fneg.fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
new file mode 100644
index 000000000000..b8d63defffed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
+declare float @llvm.amdgcn.frexp.mant.f32(float) #0
+declare double @llvm.amdgcn.frexp.mant.f64(double) #0
+
+; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
+define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
+define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %fneg.fabs.src = fsub float -0.0, %fabs.src
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %fneg.fabs.src = fsub double -0.0, %fabs.src
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll
new file mode 100644
index 000000000000..cf6d1ab237cd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [256 x float] undef, align 4
+
+; CHECK-LABEL: {{^}}get_groupstaticsize_test0:
+; CHECK: s_movk_i32 s{{[0-9]+}}, 0x800
+define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 64
+ %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
+ store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ store float %val0, float addrspace(1)* %out, align 4
+
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}groupstaticsize_test1:
+; CHECK: s_movk_i32 s{{[0-9]+}}, 0xc00
+define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+entry:
+ %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
+ store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 64
+ %tmp = icmp eq i32 %cond, 0
+ br i1 %tmp, label %if, label %else
+
+if: ; preds = %entry
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ store float %val0, float addrspace(1)* %out, align 4
+ br label %endif
+
+else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ store float %val1, float addrspace(1)* %out, align 4
+ br label %endif
+
+endif: ; preds = %else, %if
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
new file mode 100644
index 000000000000..87d838727882
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -0,0 +1,123 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+
+;CHECK-LABEL: {{^}}image_atomic_swap:
+;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_swap_v2i32:
+;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00]
+;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32 %2, <2 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_swap_i32:
+;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00]
+;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.i32(i32 %2, i32 %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_cmpswap:
+;SI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x40,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: v_mov_b32_e32 v0, v4
+define amdgpu_ps float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32 %2, i32 %3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_add:
+;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.add.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_sub:
+;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_unchanged:
+;CHECK: image_atomic_smin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x50,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_umin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x54,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_smax v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x58,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_umax v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_and v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x60,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_or v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x64,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_xor v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x68,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_inc v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_dec v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x70,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %t0 = call i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t1 = call i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32 %t0, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t2 = call i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32 %t1, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t3 = call i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32 %t2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t4 = call i32 @llvm.amdgcn.image.atomic.and.v4i32(i32 %t3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t5 = call i32 @llvm.amdgcn.image.atomic.or.v4i32(i32 %t4, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t6 = call i32 @llvm.amdgcn.image.atomic.xor.v4i32(i32 %t5, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t7 = call i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32 %t6, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t8 = call i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32 %t7, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %out = bitcast i32 %t8 to float
+ ret float %out
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.i32(i32, i32, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32, <2 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+
+declare i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32, i32, <4 x i32>, <8 x i32>,i1, i1, i1) #0
+
+declare i32 @llvm.amdgcn.image.atomic.add.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.and.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.or.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.xor.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
new file mode 100644
index 000000000000..f0d23b93119d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -0,0 +1,110 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}image_load_v4i32:
+;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_v2i32:
+;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_i32:
+;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_mip:
+;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_1:
+;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %elt = extractelement <4 x float> %tex, i32 0
+; Only first component used, test that dmask etc. is changed accordingly
+ ret float %elt
+}
+
+;CHECK-LABEL: {{^}}image_store_v4i32:
+;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_v2i32:
+;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_i32:
+;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_mip:
+;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}image_store_wait:
+;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index a28e1b1eb241..911d7d9b74d9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -6,7 +6,7 @@
;GCN: s_mov_b32 m0, s{{[0-9]+}}
;GCN: v_interp_p1_f32
;GCN: v_interp_p2_f32
-define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
main_body:
%i = extractelement <2 x i32> %4, i32 0
%j = extractelement <2 x i32> %4, i32 1
@@ -19,12 +19,11 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #0
; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
new file mode 100644
index 000000000000..07650d990f3c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=MESA -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}test:
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
+
+; MESA: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
+define void @test(i32 addrspace(1)* %out) #1 {
+ %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
+ %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+ %value = load i32, i32 addrspace(2)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_implicit:
+; 10 + 9 (36 prepended implicit bytes) + 2 (out pointer) = 21 dwords = 0x15
+; MESA: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
+define void @test_implicit(i32 addrspace(1)* %out) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
+ %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+ %value = load i32, i32 addrspace(2)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
new file mode 100644
index 000000000000..a23defd742a8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.amdgcn.ldexp.f32(float, i32) nounwind readnone
+declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_ldexp_f32:
+; SI: v_ldexp_f32
+; SI: s_endpgm
+define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+ %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_f64:
+; SI: v_ldexp_f64
+; SI: s_endpgm
+define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+ %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_undef_f32:
+; SI-NOT: v_ldexp_f32
+define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+ %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
new file mode 100644
index 000000000000..014369b45015
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
+
+; GCN-LABEL: {{^}}v_lerp:
+; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+ %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
new file mode 100644
index 000000000000..f78257f1d226
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+; ERR: intrinsic not supported on subtarget
+
+declare float @llvm.amdgcn.log.clamp.f32(float) #0
+
+; GCN-LABEL: {{^}}v_log_clamp_f32:
+; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+ %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
+ store float %log.clamp, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 02ee2039542a..4825c3a479c1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -6,7 +6,7 @@
;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
-define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
new file mode 100644
index 000000000000..a85fc7e13fd8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
+; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+ store i32 %tmp0, i32 addrspace(1)* %out
+ ret void
+}
+
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+ store i32 %tmp1, i32 addrspace(1)* %out
+ ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: ; %endif
+; VI-OPT: s_mov_b32
+; VI-OPT: s_mov_b32
+; VI-NOOPT: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+ %cmp = fcmp oeq float %cond, 0.0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %out_val = load float, float addrspace(1)* %out
+ %if_val = fadd float %a, %out_val
+ br label %endif
+
+else:
+ %in_val = load float, float addrspace(1)* %in
+ %else_val = fadd float %b, %in_val
+ br label %endif
+
+endif:
+ %val = phi float [%if_val, %if], [%else_val, %else]
+ %val_i32 = bitcast float %val to i32
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp_float = bitcast i32 %tmp2 to float
+ store float %tmp_float, float addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
new file mode 100644
index 000000000000..fd1a463fd3e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+ %r = bitcast i32 %live.32 to float
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+ %live = call i1 @llvm.amdgcn.ps.live()
+ br i1 %live, label %end, label %dead
+
+dead:
+ %tc.dead = mul i32 %in, 2
+ br label %end
+
+end:
+ %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
new file mode 100644
index 000000000000..6bf871543ca2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define void @test(i32 addrspace(1)* %out) {
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %value = load i32, i32 addrspace(2)* %header_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
new file mode 100644
index 000000000000..825231bf8680
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rcp.f32(float) #0
+declare double @llvm.amdgcn.rcp.f64(double) #0
+
+declare double @llvm.sqrt.f64(double) #0
+declare float @llvm.sqrt.f32(float) #0
+
+; FUNC-LABEL: {{^}}rcp_undef_f32:
+; SI-NOT: v_rcp_f32
+define void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+ %rcp = call float @llvm.amdgcn.rcp.f32(float undef)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_no_fp32_denormals_rcp_f32:
+; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_f32_denormals_rcp_pat_f32:
+; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
+; SI: v_div_scale_f32
+define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+ %sqrt = call float @llvm.sqrt.f32(float %src)
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
+; SI: v_rsq_f32_e32
+define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+ %sqrt = call float @llvm.sqrt.f32(float %src)
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rcp_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_pat_f64:
+; SI: v_div_scale_f64
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+ %sqrt = call double @llvm.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+ %sqrt = call double @llvm.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
+attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
+attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll
index 2e299e30b8c7..76a5757e4c20 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll
@@ -1,23 +1,19 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}read_workdim:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].Z
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA %s
+; GCN-LABEL: {{^}}read_workdim:
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @read_workdim(i32 addrspace(1)* %out) {
entry:
- %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ %0 = call i32 @llvm.amdgcn.read.workdim() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}read_workdim_known_bits:
+; GCN-LABEL: {{^}}read_workdim_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOT: 0xff
@@ -25,13 +21,26 @@ entry:
; GCN: buffer_store_dword [[VVAL]]
define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
entry:
- %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ %dim = call i32 @llvm.amdgcn.read.workdim() #0
%shl = shl i32 %dim, 24
%shr = lshr i32 %shl, 24
store i32 %shr, i32 addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}legacy_read_workdim:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+define void @legacy_read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.read.workdim() #0
declare i32 @llvm.AMDGPU.read.workdim() #0
-attributes #0 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
new file mode 100644
index 000000000000..73a5c54e175e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
+declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
+
+; FUNC-LABEL: {{^}}rsq_clamp_f32:
+; SI: v_rsq_clamp_f32_e32
+
+; VI: s_load_dword [[SRC:s[0-9]+]]
+; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]]
+; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
+; TODO: this constant should be folded:
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]]
+; VI: buffer_store_dword [[RESULT]]
+define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ store float %rsq_clamp, float addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}rsq_clamp_f64:
+; SI: v_rsq_clamp_f64_e32
+
+; TODO: this constant should be folded:
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], -1
+; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff
+; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff
+; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
+define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ store double %rsq_clamp, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
+; SI-NOT: v_rsq_clamp_f32
+define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
+ store float %rsq_clamp, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
new file mode 100644
index 000000000000..47bd0d82b834
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.legacy(float) #0
+
+; FUNC-LABEL: {{^}}rsq_legacy_f32:
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
+define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
+define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
+; SI-NOT: v_rsq_legacy_f32
+define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
new file mode 100644
index 000000000000..012f6cd82925
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.f32(float) #0
+declare double @llvm.amdgcn.rsq.f64(double) #0
+
+; FUNC-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
+define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
+define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f64:
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
+define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f64_constant_100.0
+; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
+; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_undef_f32:
+; SI-NOT: v_rsq_f32
+define void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float undef)
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
new file mode 100644
index 000000000000..132e476d5e29
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_barrier:
+; GCN: buffer_store_dword
+; GCN: s_waitcnt
+; GCN: s_barrier
+define void @test_barrier(i32 addrspace(1)* %out) #0 {
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
+ store i32 %tmp, i32 addrspace(1)* %tmp1
+ call void @llvm.amdgcn.s.barrier()
+ %tmp2 = call i32 @llvm.r600.read.local.size.x()
+ %tmp3 = sub i32 %tmp2, 1
+ %tmp4 = sub i32 %tmp3, %tmp
+ %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5
+ store i32 %tmp6, i32 addrspace(1)* %tmp1
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare i32 @llvm.r600.read.local.size.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index f8af67c17ec2..ecd4ac6824cc 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.inv() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; GCN-LABEL: {{^}}test_s_dcache_inv:
; GCN-NEXT: ; BB#0:
@@ -15,10 +16,11 @@ define void @test_s_dcache_inv() #0 {
; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait:
; GCN-NEXT: ; BB#0:
-; GCN-NEXT: s_dcache_inv
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_dcache_inv
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_inv_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index a8502a7c5033..097f35d42c4f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; GCN-LABEL: {{^}}test_s_dcache_inv_vol:
; GCN-NEXT: ; BB#0:
@@ -16,9 +17,10 @@ define void @test_s_dcache_inv_vol() #0 {
; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait:
; GCN-NEXT: ; BB#0:
; GCN-NEXT: s_dcache_inv_vol
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_inv_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index f9ae09b391aa..9ecce7463f6b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.wb() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-LABEL: {{^}}test_s_dcache_wb:
; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@ define void @test_s_dcache_wb() #0 {
; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait:
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_wb_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index d9145458a1f6..943f8c67a2e3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.wb.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-LABEL: {{^}}test_s_dcache_wb_vol:
; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@ define void @test_s_dcache_wb_vol() #0 {
; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait:
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb_vol
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_wb_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
new file mode 100644
index 000000000000..251eec656edc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+; FUNC-LABEL: {{^}}s_getreg_test:
+; CHECK: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
+define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+ %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #0
+ %lds_size_bytes = shl i32 %lds_size_64dwords, 8
+ store i32 %lds_size_bytes, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.s.getreg(i32) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
new file mode 100644
index 000000000000..372cba6eb67b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.s.memrealtime() #0
+
+; GCN-LABEL: {{^}}test_s_memrealtime:
+; GCN-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.amdgcn.s.memrealtime()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
new file mode 100644
index 000000000000..8ce2d48733c6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.s.memtime() #0
+
+; GCN-LABEL: {{^}}test_s_memtime:
+; GCN-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.amdgcn.s.memtime()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.amdgcn.s.memtime()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
new file mode 100644
index 000000000000..870aa48a3417
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.s.sleep(i32) #0
+
+; GCN-LABEL: {{^}}test_s_sleep:
+; GCN: s_sleep 0{{$}}
+; GCN: s_sleep 1{{$}}
+; GCN: s_sleep 2{{$}}
+; GCN: s_sleep 3{{$}}
+; GCN: s_sleep 4{{$}}
+; GCN: s_sleep 5{{$}}
+; GCN: s_sleep 6{{$}}
+; GCN: s_sleep 7{{$}}
+; GCN: s_sleep 8{{$}}
+; GCN: s_sleep 9{{$}}
+; GCN: s_sleep 10{{$}}
+; GCN: s_sleep 11{{$}}
+; GCN: s_sleep 12{{$}}
+; GCN: s_sleep 13{{$}}
+; GCN: s_sleep 14{{$}}
+; GCN: s_sleep 15{{$}}
+define void @test_s_sleep(i32 %x) #0 {
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ call void @llvm.amdgcn.s.sleep(i32 1)
+ call void @llvm.amdgcn.s.sleep(i32 2)
+ call void @llvm.amdgcn.s.sleep(i32 3)
+ call void @llvm.amdgcn.s.sleep(i32 4)
+ call void @llvm.amdgcn.s.sleep(i32 5)
+ call void @llvm.amdgcn.s.sleep(i32 6)
+ call void @llvm.amdgcn.s.sleep(i32 7)
+
+ ; Values that might only work on VI
+ call void @llvm.amdgcn.s.sleep(i32 8)
+ call void @llvm.amdgcn.s.sleep(i32 9)
+ call void @llvm.amdgcn.s.sleep(i32 10)
+ call void @llvm.amdgcn.s.sleep(i32 11)
+ call void @llvm.amdgcn.s.sleep(i32 12)
+ call void @llvm.amdgcn.s.sleep(i32 13)
+ call void @llvm.amdgcn.s.sleep(i32 14)
+ call void @llvm.amdgcn.s.sleep(i32 15)
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
new file mode 100644
index 000000000000..c2d48f99aac5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: image_store
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
+; CHECK-NEXT: image_store
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ ret void
+}
+
+; Test that the intrinsic is merged with automatically generated waits and
+; emitted as late as possible.
+;
+; CHECK-LABEL: {{^}}test2:
+; CHECK: image_load
+; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
+; CHECK: s_waitcnt
+; CHECK-NEXT: image_store
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
+ %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ %c.1 = mul i32 %c, 2
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
new file mode 100644
index 000000000000..9dc4554b88a4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.sin.f32(float) #0
+
+; GCN-LABEL: {{^}}v_sin_f32:
+; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+ %sin = call float @llvm.amdgcn.sin.f32(float %src) #0
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index 6b546a7e17c1..7757e411553b 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
+declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
; SI-LABEL: {{^}}test_trig_preop_f64:
; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]]
@@ -12,7 +12,7 @@ declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
%b = load i32, i32 addrspace(1)* %bptr, align 4
- %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
+ %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
@@ -24,7 +24,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
; SI: s_endpgm
define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
- %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
+ %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
new file mode 100644
index 000000000000..c22eac7e271c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=CI-HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=VI-HSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+; ALL-LABEL: {{^}}test_workgroup_id_x:
+
+; HSA: .amd_kernel_code_t
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; HSA: .end_amd_kernel_code_t
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s2{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s6{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; MESA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.x()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_id_y:
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 1
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s7{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; MESA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.y()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_id_z:
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 1
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s7{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; MESA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.z()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
new file mode 100644
index 000000000000..28ef7b82ef84
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=CI-HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=VI-HSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 132{{$}}
+
+; ALL-LABEL: {{^}}test_workitem_id_x:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+
+; ALL-NOT: v0
+; ALL: {{buffer|flat}}_store_dword {{.*}}v0
+define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 2180{{$}}
+
+; ALL-LABEL: {{^}}test_workitem_id_y:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
+
+; ALL-NOT: v1
+; ALL: {{buffer|flat}}_store_dword {{.*}}v1
+define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.y()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 4228{{$}}
+
+; ALL-LABEL: {{^}}test_workitem_id_z:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
+
+; ALL-NOT: v2
+; ALL: {{buffer|flat}}_store_dword {{.*}}v2
+define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.z()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
index 42df6db1ccfd..6b865d8076e6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
@@ -3,7 +3,7 @@
; SI-LABEL: {{^}}kilp_gs_const:
; SI: s_mov_b64 exec, 0
-define void @kilp_gs_const() #0 {
+define amdgpu_gs void @kilp_gs_const() {
main_body:
%0 = icmp ule i32 0, 3
%1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
@@ -16,6 +16,4 @@ main_body:
declare void @llvm.AMDGPU.kilp(float)
-attributes #0 = { "ShaderType"="2" }
-
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
deleted file mode 100644
index 0c3e4ecaa1a0..000000000000
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_lrp:
-; SI: v_mad_f32
-; SI: v_mac_f32_e32
-define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
- %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
- store float %mad, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll
index c65df8b3e8da..eb7dcbbf2346 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
;FUNC-LABEL: test
;EG: MULADD_IEEE *
@@ -37,5 +37,3 @@ define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
declare float @llvm.cos.f32(float) readnone
declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index b01f8ab2bdf9..1a37ba311606 100644
--- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -14,17 +14,16 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 244715) (llvm/trunk 244718)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 244715) (llvm/trunk 244718)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "/tmp/test_debug_value.cl", directory: "/Users/matt/src/llvm/build_debug")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, variables: !9)
+!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !9)
!5 = !DISubroutineType(types: !6)
!6 = !{null, !7}
!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 32)
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index d83ab562b718..8398309d7520 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -6,77 +6,77 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
@@ -87,41 +87,41 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
}
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
@@ -153,15 +153,11 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
; FIXME: Use 64-bit ops
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
+; SI: ds_read2_b64
+; SI: ds_read2_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
+; SI: ds_write2_b64
; SI-DAG: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/llvm.pow.ll b/test/CodeGen/AMDGPU/llvm.pow.ll
index c4ae652619c2..3f203ddf93b8 100644
--- a/test/CodeGen/AMDGPU/llvm.pow.ll
+++ b/test/CodeGen/AMDGPU/llvm.pow.ll
@@ -5,12 +5,12 @@
;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test1(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test1(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = call float @llvm.pow.f32( float %r0, float %r1)
%vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
@@ -27,14 +27,12 @@ define void @test1(<4 x float> inreg %reg0) #0 {
;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_ps void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
%vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1)
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.pow.f32(float ,float ) readonly
declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 036cd2ca82a6..4db29c58385e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -1,11 +1,11 @@
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
%src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
%src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
- %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+ %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
store float %dp4, float addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
new file mode 100644
index 000000000000..e4e6dd8e1069
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: {{^}}test_group_barrier:
+; EG: GROUP_BARRIER
+define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+entry:
+ %tmp = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
+ store i32 %tmp, i32 addrspace(1)* %tmp1
+ call void @llvm.r600.group.barrier()
+ %tmp2 = call i32 @llvm.r600.read.local.size.x()
+ %tmp3 = sub i32 %tmp2, 1
+ %tmp4 = sub i32 %tmp3, %tmp
+ %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5
+ store i32 %tmp6, i32 addrspace(1)* %tmp1
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.r600.group.barrier() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll
new file mode 100644
index 000000000000..2f5947395c43
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: {{^}}read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV * [[VAL]], KC0[2].Z
+define void @read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.r600.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+; EG-LABEL: {{^}}read_workdim_known_bits:
+define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.r600.read.workdim() #0
+ %shl = shl i32 %dim, 24
+ %shr = lshr i32 %shl, 24
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+; EG-LABEL: {{^}}legacy_read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV * [[VAL]], KC0[2].Z
+define void @legacy_read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.workdim() #0
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
new file mode 100644
index 000000000000..1c6e7950e9b7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+
+declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone
+
+; EG-LABEL: {{^}}rsq_clamped_f32:
+; EG: RECIPSQRT_CLAMPED
+define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src)
+ store float %rsq_clamped, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
new file mode 100644
index 000000000000..1d6bff01e662
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+
+declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone
+
+; EG-LABEL: {{^}}recipsqrt.ieee_f32:
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
new file mode 100644
index 000000000000..409037f3e976
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@@ -0,0 +1,65 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+bb:
+ %addr = load <4 x float>, <4 x float> addrspace(1)* %in
+ %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp1 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp3 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp5 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp6 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp7 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+ %tmp10 = shufflevector <4 x float> %tmp9, <4 x float> %tmp9, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp11 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp12 = shufflevector <4 x float> %tmp11, <4 x float> %tmp11, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp13 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp14 = shufflevector <4 x float> %tmp13, <4 x float> %tmp13, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp15 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+ %tmp16 = shufflevector <4 x float> %tmp15, <4 x float> %tmp15, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+ %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp18 = shufflevector <4 x float> %tmp17, <4 x float> %tmp17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp20 = shufflevector <4 x float> %tmp19, <4 x float> %tmp19, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+ %tmp21 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp22 = shufflevector <4 x float> %tmp21, <4 x float> %tmp21, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp23 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp24 = shufflevector <4 x float> %tmp23, <4 x float> %tmp23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp25 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp26 = shufflevector <4 x float> %tmp25, <4 x float> %tmp25, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp28 = shufflevector <4 x float> %tmp27, <4 x float> %tmp27, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp29 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ store <4 x float> %tmp31, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll
index 661db51ad032..cf7c0e4c6fb6 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}rint_f32:
; R600: RNDNE
@@ -43,18 +43,6 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32:
-; R600: RNDNE
-
-; SI: v_rndne_f32_e32
-define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
-entry:
- %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.AMDIL.round.nearest.f32(float) #0
declare float @llvm.rint.f32(float) #0
declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 98afbeee93e6..a8024b713261 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -13,7 +13,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; FUNC-LABEL: {{^}}v_round_f64:
; SI: buffer_load_dwordx2
-; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+; SI-DAG: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
; SI-DAG: v_not_b32_e32
; SI-DAG: v_not_b32_e32
@@ -27,7 +27,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
%x = load double, double addrspace(1)* %gep
@@ -60,7 +60,7 @@ define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index d0e49243ffa7..9b7bb00d3c38 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -5,9 +5,9 @@
; FUNC-LABEL: {{^}}round_f32:
; SI-DAG: s_load_dword [[SX:s[0-9]+]]
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
-; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
-; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
-; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]|
; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll
index 3bb245c2e249..04754396a0f7 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -1,8 +1,5 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: sin_f32
; EG: MULADD_IEEE *
@@ -10,58 +7,91 @@
; EG: ADD *
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG-NOT: SIN
+
; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-
define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
%sin = call float @llvm.sin.f32(float %x)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sin_3x_f32:
-; SI-UNSAFE-NOT: v_add_f32
-; SI-UNSAFE: 0x3ef47644
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_mul_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}safe_sin_3x_f32:
+; SI: v_mul_f32
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
%y = fmul float 3.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sin_2x_f32:
-; SI-UNSAFE-NOT: v_add_f32
-; SI-UNSAFE: 0x3ea2f983
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_add_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}unsafe_sin_3x_f32:
+; SI-NOT: v_add_f32
+; SI: 0x3ef47644
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
+ %y = fmul float 3.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_sin_2x_f32:
+; SI: v_add_f32
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_sin_2x_f32:
+; SI-NOT: v_add_f32
+; SI: 0x3ea2f983
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}test_2sin_f32:
-; SI-UNSAFE: 0x3ea2f983
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_add_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}test_safe_2sin_f32:
+; SI: v_add_f32
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_unsafe_2sin_f32:
+; SI: 0x3ea2f983
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -74,19 +104,21 @@ define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG-NOT: SIN
+
; SI: v_sin_f32
; SI: v_sin_f32
; SI: v_sin_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-
define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
%sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
store <4 x float> %sin, <4 x float> addrspace(1)* %out
ret void
}
-declare float @llvm.sin.f32(float) readnone
-declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone
+declare float @llvm.sin.f32(float) #0
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) #0
-attributes #0 = { "ShaderType"="0" }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.ll b/test/CodeGen/AMDGPU/llvm.sqrt.ll
deleted file mode 100644
index c6da047f5392..000000000000
--- a/test/CodeGen/AMDGPU/llvm.sqrt.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
-
-; R600-LABEL: {{^}}sqrt_f32:
-; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-; SI-LABEL: {{^}}sqrt_f32:
-; SI: v_sqrt_f32_e32
-define void @sqrt_f32(float addrspace(1)* %out, float %in) {
-entry:
- %0 = call float @llvm.sqrt.f32(float %in)
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; R600-LABEL: {{^}}sqrt_v2f32:
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-; SI-LABEL: {{^}}sqrt_v2f32:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; R600-LABEL: {{^}}sqrt_v4f32:
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-; SI-LABEL: {{^}}sqrt_v4f32:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
- %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
- store <4 x float> %0, <4 x float> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check:
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check(float addrspace(1)* %out, float %in) {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp olt float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, float addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_ult:
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp ult float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, float addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_v2:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_v2_ult
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, <2 x float> addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.sqrt.f32(float %in)
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll
new file mode 100644
index 000000000000..f94a3785a685
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_f64:
+; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
+; GCN-NOHSA: buffer_store_dwordx2
+; GCN-HSA: flat_store_dwordx2
+define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+ %ld = load double, double addrspace(2)* %in
+ store double %ld, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll
new file mode 100644
index 000000000000..f15e4f484ffa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: buffer_store_byte
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %load = load i1, i1 addrspace(2)* %in
+ store i1 %load, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i1:
+define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i1:
+define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i1:
+define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i1:
+define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i1:
+define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v32i1:
+define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v64i1:
+define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_dword
+define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: buffer_store_dword
+
+; EG: VTX_READ_8
+; EG: BFE_INT
+define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
+define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
+define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
+define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
+define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
+define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
+define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
+define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
+define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
+define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
+define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
+define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
+define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
+define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
+define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
+define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
+define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i1_to_i64:
+; GCN-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
+; GCN: buffer_store_dwordx2
+define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i1_to_i64:
+; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: buffer_store_dwordx2
+define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
+define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
+define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
+define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
+define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
+define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
+define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
+define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
+define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
+define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
+define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
+define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
+define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
+define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
+define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
+define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
+define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
new file mode 100644
index 000000000000..ef9791d8f7a1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -0,0 +1,441 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i16:
+; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(2)* %in
+ store i16 %ld, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i16:
+; GCN: s_load_dword s
+
+; EG: VTX_READ_32
+define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i16:
+; GCN: s_load_dwordx2 s
+
+; EG-DAG: VTX_READ_32
+; EG-DAG: VTX_READ_16
+define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i16:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i16:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i16:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_ushort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_sshort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 16
+define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-HSA: flat_load_sshort
+define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}}
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32:
+; GCN: s_load_dwordx2
+define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
+; GCN: s_load_dwordx2
+define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32:
+; GCN: s_load_dwordx4
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32:
+; GCN: s_load_dwordx4
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32:
+; GCN: s_load_dwordx8
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32:
+; GCN: s_load_dwordx8
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
+; GCN-DAG: s_load_dwordx16
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
+; GCN: s_load_dwordx16
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i32:
+define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i64:
+define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i64:
+define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
+define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
+define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
+define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
+define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
+define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
+define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i64:
+define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i64:
+define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i64:
+define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i64:
+define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
+; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
+; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll
new file mode 100644
index 000000000000..40c29be60548
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -0,0 +1,380 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i32:
+; GCN: s_load_dword s{{[0-9]+}}
+
+; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(2)* %in
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i32:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i32:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i32:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i32:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i32:
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i32_to_i64:
+; GCN-DAG: s_load_dword s[[SLO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[SHI:[0-9]+]], 0{{$}}
+; GCN: store_dwordx2
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG: CF_END
+; EG: VTX_READ_32
+define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+ %ld = load i32, i32 addrspace(2)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i32_to_i64:
+; GCN: s_load_dword s[[SLO:[0-9]+]]
+; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[SLO]], 31
+; GCN: store_dwordx2
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG: CF_END
+; EG: VTX_READ_32
+; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
+; EG: 31
+define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+ %ld = load i32, i32 addrspace(2)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
+; GCN: s_load_dword
+; GCN: store_dwordx2
+define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i32_to_v1i64:
+; GCN: s_load_dword s[[LO:[0-9]+]]
+; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
+; GCN: store_dwordx2
+define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
+; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: store_dwordx4
+define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i32_to_v2i64:
+; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_ashr_i32
+
+; GCN: store_dwordx4
+define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i32_to_v4i64:
+; GCN: s_load_dwordx4
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i32_to_v4i64:
+; GCN: s_load_dwordx4
+
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i32_to_v8i64:
+; GCN: s_load_dwordx8
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i32_to_v8i64:
+; GCN: s_load_dwordx8
+
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i32_to_v16i64:
+; GCN: s_load_dwordx16
+
+
+; GCN-DAG: s_ashr_i32
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i32_to_v16i64:
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i32_to_v32i64:
+
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i32_to_v32i64:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll
new file mode 100644
index 000000000000..e4656a2b2ac6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}constant_load_i64:
+; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; EG: VTX_READ_64
+define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
+ %ld = load i64, i64 addrspace(2)* %in
+ store i64 %ld, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i64:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i64:
+; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+
+; EG-DAG: VTX_READ_128
+; EG-DAG: VTX_READ_128
+define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i64:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i64:
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i64:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
new file mode 100644
index 000000000000..87828982a987
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -0,0 +1,567 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}constant_load_i8:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(2)* %in
+ store i8 %ld, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i8:
+; GCN-NOHSA: buffer_load_ushort v
+; GCN-HSA: flat_load_ushort v
+
+; EG: VTX_READ_16
+define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i8:
+; GCN: s_load_dword s
+
+; EG-DAG: VTX_READ_32
+define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i8:
+; GCN: s_load_dword s
+
+; EG: VTX_READ_32
+define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i8:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i8:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_sbyte
+; GCN-HSA: flat_load_sbyte
+
+; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 8
+define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %ld = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
+define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i32:
+define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+
+; GCN-HSA: flat_load_ushort
+
+; GCN: v_bfe_i32
+; GCN: v_bfe_i32
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:
+; GCN: s_load_dword s
+
+; GCN-DAG: s_bfe_u32
+; GCN-DAG: s_bfe_u32
+; GCN-DAG: s_and_b32
+define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:
+; GCN: s_load_dword s
+
+; GCN-DAG: s_bfe_i32
+; GCN-DAG: s_bfe_i32
+; GCN-DAG: s_bfe_i32
+define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_sext_i32_i8
+; GCN-DAG: s_ashr_i32
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i8
+define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i32:
+define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i32:
+define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i32:
+define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i32:
+define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i32:
+define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i32:
+define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+
+; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:
+; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i64:
+define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i64:
+define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
+define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
+define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
+define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
+define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
+define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
+define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
+define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
+define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i64:
+define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i64:
+define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
+; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
+; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+
+; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],
+
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
+define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i16:
+define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
+define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i16:
+define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
+define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i16:
+define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
+define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i16:
+define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
+define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i16:
+define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i16:
+define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i16:
+define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
+; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
+; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
new file mode 100644
index 000000000000..23f4a6079e81
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_f32:
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
+; GCN-HSA: flat_load_dword
+
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load float, float addrspace(1)* %in
+ store float %tmp0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2f32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; R600: VTX_READ_64
+define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
+ store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
+ store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
+ store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
+ store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
+ store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
new file mode 100644
index 000000000000..a86cc5a6d3d4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -0,0 +1,94 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_f64:
+; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-NOHSA: buffer_store_dwordx2 [[VAL]]
+
+; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
+define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %ld = load double, double addrspace(1)* %in
+ store double %ld, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x double>, <2 x double> addrspace(1)* %in
+ store <2 x double> %ld, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x double>, <3 x double> addrspace(1)* %in
+ store <3 x double> %ld, <3 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x double>, <4 x double> addrspace(1)* %in
+ store <4 x double> %ld, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x double>, <8 x double> addrspace(1)* %in
+ store <8 x double> %ld, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x double>, <16 x double> addrspace(1)* %in
+ store <16 x double> %ld, <16 x double> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll
new file mode 100644
index 000000000000..ebfec781087e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: buffer_store_byte
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %load = load i1, i1 addrspace(1)* %in
+ store i1 %load, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i1:
+define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i1:
+define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i1:
+define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i1:
+define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i1:
+define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v32i1:
+define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v64i1:
+define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_dword
+define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: buffer_store_dword
+
+; EG: VTX_READ_8
+; EG: BFE_INT
+define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
+define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
+define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
+define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
+define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
+define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
+define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
+define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
+define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
+define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
+define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
+define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
+define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
+define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
+define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
+define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
+define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
+; GCN-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
+; GCN: buffer_store_dwordx2
+define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
+; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: buffer_store_dwordx2
+define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
+define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
+define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
+define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
+define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
+define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
+define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
+define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
+define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
+define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
+define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
+define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
+define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
+define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
+define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
+define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
+define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
new file mode 100644
index 000000000000..11e6b10c38ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -0,0 +1,476 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: r600 is broken because the bigger test cases spill and spilling is not implemented
+
+; FUNC-LABEL: {{^}}global_load_i16:
+; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(1)* %in
+ store i16 %ld, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i16:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG: VTX_READ_32
+define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i16:
+; GCN-NOHSA: buffer_load_dwordx2 v
+; GCN-HSA: flat_load_dwordx2 v
+
+; EG-DAG: VTX_READ_32
+; EG-DAG: VTX_READ_16
+define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i16:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i16:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i16:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_ushort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_sshort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 16
+define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-HSA: flat_load_sshort
+define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i32:
+; GCN-NOHSA: buffer_load_dword
+
+; GCN-HSA: flat_load_dword
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32:
+; GCN-NOHSA: buffer_load_dwordx2
+
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i32:
+; GCN-NOHSA: buffer_load_dwordx2
+
+; GCN-HSA: flat_load_dwordx2
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32:
+define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32:
+define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64:
+define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64:
+define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
+define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
+define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
+define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
+define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
+define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
+define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64:
+define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64:
+define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64:
+define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64:
+define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
+; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
+; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
new file mode 100644
index 000000000000..5e1171a69be5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -0,0 +1,521 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}global_load_i32:
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
+; GCN-HSA: flat_load_dword
+
+; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(1)* %in
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i32_to_i64:
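+; The zero-extended high dword is materialized as a constant 0 (v_mov_b32) and
+; stored together with the loaded low dword as a single dwordx2.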
+; GCN-NOHSA-DAG: buffer_load_dword v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_dword v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %ld = load i32, i32 addrspace(1)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i32_to_i64:
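+; The sign-extended high dword is produced by an arithmetic shift right of the
+; loaded low dword by 31.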
+; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-HSA: flat_load_dword v[[LO:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+
+
+; EG: MEM_RAT
+; EG: VTX_READ_32
+; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
+; EG: 31
+define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %ld = load i32, i32 addrspace(1)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i32_to_v1i64:
+; GCN-NOHSA: buffer_load_dword
+; GCN-NOHSA: buffer_store_dwordx2
+
+; GCN-HSA: flat_load_dword
+; GCN-HSA: flat_store_dwordx2
+define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i32_to_v1i64:
+; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-HSA: flat_load_dword v[[LO:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i32_to_v2i64:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx2
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i32_to_v2i64:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i32_to_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i32_to_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i32_to_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i32_to_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i32_to_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i32_to_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i32_to_v32i64:
+
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i32_to_v32i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
new file mode 100644
index 000000000000..305b954c78f9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -0,0 +1,122 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_i64:
+; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-NOHSA: buffer_store_dwordx2 [[VAL]]
+
+; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
+
+; EG: VTX_READ_64
+define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %ld = load i64, i64 addrspace(1)* %in
+ store i64 %ld, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
new file mode 100644
index 000000000000..b697967f1a23
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -0,0 +1,564 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}global_load_i8:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(1)* %in
+ store i8 %ld, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i8:
+; GCN-NOHSA: buffer_load_ushort v
+; GCN-HSA: flat_load_ushort v
+
+; EG: VTX_READ_16
+define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i8:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG-DAG: VTX_READ_32
+define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i8:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG: VTX_READ_32
+define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i8:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i8:
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i32:
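+; buffer_load_ubyte / flat_load_ubyte already zero-extend the byte into the
+; 32-bit destination register, so no separate mask instruction should be needed.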
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i32:
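+; buffer_load_sbyte / flat_load_sbyte sign-extend the byte to 32 bits; the EG
+; path instead sign-extends the VTX_READ_8 result with an 8-bit BFE_INT.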
+; GCN-NOHSA: buffer_load_sbyte
+; GCN-HSA: flat_load_sbyte
+
+; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 8
+define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %ld = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
+define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i32:
+define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i8_to_v3i32:
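+; The packed <3 x i8> dword is split with unsigned bit-field extracts
+; (v_bfe_u32) for the upper bytes and an AND with 0xff for the lowest byte.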
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
+define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i8_to_v3i32:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i32:
+define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i32:
+define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i32:
+define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i32:
+define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i32:
+define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i32:
+define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i32:
+define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i32:
+define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+
+; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i64:
+; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i64:
+define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i64:
+define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
+define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
+define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
+define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
+define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
+define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
+define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
+define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
+define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i64:
+define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i64:
+define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
+; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
+; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+
+; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],
+
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
+define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i16:
+define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
+define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i16:
+define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
+define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i16:
+define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
+define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i16:
+define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
+define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i16:
+define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i16:
+define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i16:
+define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
+; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
+; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll
deleted file mode 100644
index 0ca49fde3e7b..000000000000
--- a/test/CodeGen/AMDGPU/load-i1.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}global_copy_i1_to_i1:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: buffer_store_byte
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: AND_INT
-define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- store i1 %load, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_copy_i1_to_i1:
-; SI: ds_read_u8
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: ds_write_b8
-; SI: s_endpgm
-
-; EG: LDS_UBYTE_READ_RET
-; EG: AND_INT
-; EG: LDS_BYTE_WRITE
-define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind {
- %load = load i1, i1 addrspace(3)* %in
- store i1 %load, i1 addrspace(3)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}constant_copy_i1_to_i1:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: buffer_store_byte
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: AND_INT
-define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind {
- %load = load i1, i1 addrspace(2)* %in
- store i1 %load, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: buffer_store_dword
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: BFE_INT
-define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-
-define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i64
- store i64 %ext, i64 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
-; SI: buffer_load_ubyte
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i64
- store i64 %ext, i64 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32
-; SI: buffer_store_byte
-; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
- store i1 %x, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_zext_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
- %ext = zext i1 %x to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_zext_i64:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
- %ext = zext i1 %x to i64
- store i64 %ext, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_sext_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
- %ext = sext i1 %x to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_sext_i64:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
- %ext = sext i1 %x to i64
- store i64 %ext, i64 addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll
index 1daf0e6527b9..b1899a45bf56 100644
--- a/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=cayman
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -88,14 +88,14 @@ main_body:
%83 = insertelement <4 x float> %82, float %75, i32 1
%84 = insertelement <4 x float> %83, float %77, i32 2
%85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
- %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
+ %86 = call float @llvm.r600.dot4(<4 x float> %81, <4 x float> %85)
%87 = insertelement <4 x float> undef, float %86, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %87, i32 2, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
declare float @fabs(float) #2
@@ -104,14 +104,13 @@ declare float @fabs(float) #2
declare float @llvm.AMDGPU.rsq(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readonly
declare float @llvm.pow.f32(float, float) #3
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
attributes #2 = { readonly }
attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/load-local-f32.ll b/test/CodeGen/AMDGPU/load-local-f32.ll
new file mode 100644
index 000000000000..77b5e3cf3aed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_f32_local:
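+; LDS accesses on GCN require m0 to be initialized first, hence the
+; s_mov_b32 m0 before the ds_read_b32.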
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load float, float addrspace(3)* %in
+ store float %tmp0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2f32_local:
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
+ store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: should this do a read2_b64?
+; FUNC-LABEL: {{^}}local_load_v3f32:
+; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN: s_waitcnt
+; GCN-DAG: ds_write_b64
+; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
+ store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4f32:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
+ store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8f32:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
+ store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16f32:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
+ store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-f64.ll b/test/CodeGen/AMDGPU/load-local-f64.ll
new file mode 100644
index 000000000000..27d39b7e9d7d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_f64:
+; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
+ %ld = load double, double addrspace(3)* %in
+ store double %ld, double addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2f64:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x double>, <2 x double> addrspace(3)* %in
+ store <2 x double> %ld, <2 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3f64:
+; GCN-DAG: ds_read2_b64
+; GCN-DAG: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x double>, <3 x double> addrspace(3)* %in
+ store <3 x double> %ld, <3 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x double>, <4 x double> addrspace(3)* %in
+ store <4 x double> %ld, <4 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x double>, <8 x double> addrspace(3)* %in
+ store <8 x double> %ld, <8 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x double>, <16 x double> addrspace(3)* %in
+ store <16 x double> %ld, <16 x double> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i1.ll b/test/CodeGen/AMDGPU/load-local-i1.ll
new file mode 100644
index 000000000000..2eed9917b5e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i1:
+; GCN: ds_read_u8
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: ds_write_b8
+
+; EG: LDS_UBYTE_READ_RET
+; EG: AND_INT
+; EG: LDS_BYTE_WRITE
+define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %load = load i1, i1 addrspace(3)* %in
+ store i1 %load, i1 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i1:
+define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i1:
+define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i1:
+define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i1:
+define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i1:
+define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v32i1:
+define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v64i1:
+define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
+; GCN: ds_read_u8
+; GCN: ds_write_b32
+define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i1_to_i32:
+; GCN: ds_read_u8
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: ds_write_b32
+
+; EG: LDS_UBYTE_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
+define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
+define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
+define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
+define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
+define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
+define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
+define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
+define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
+define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
+define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
+define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
+define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
+define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
+define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
+define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
+define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i1_to_i64:
+; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN: ds_write_b64
+define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i1_to_i64:
+; GCN: ds_read_u8 [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: ds_write_b64
+define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
+define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
+define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
+define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
+define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
+define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
+define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
+define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
+define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
+define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
+define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
+define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
+define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
+define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
+define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
+define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
+define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
new file mode 100644
index 000000000000..d3c0af469dd2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -0,0 +1,454 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i16:
+; GCN: ds_read_u16 v{{[0-9]+}}
+
+; EG: LDS_USHORT_READ_RET
+define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(3)* %in
+ store i16 %ld, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i16:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i16:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b16
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_READ_RET
+define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i16:
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i16:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i16:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
+; GCN: ds_read_u16
+; GCN: ds_write_b32
+
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_i16
+
+; EG: LDS_USHORT_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
+; GCN: ds_read_u16
+define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
+; GCN: ds_read_i16
+define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i16_to_v3i32:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b64
+define void @local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i16_to_v3i32:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b64
+define void @local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FIXME: Should have 2 ds_read_b64
+; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:1{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FIXME: Missed read2
+; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
+define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
+; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
+; GCN: ds_read_i16 v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
+define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
+define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
+define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
+define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
+define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
+define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
+define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
+define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
+define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
+define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
+define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
+define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
+; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
+; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll
new file mode 100644
index 000000000000..d68a8518e2ed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -0,0 +1,182 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_load_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(3)* %in
+ store i32 %ld, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i32:
+; GCN: ds_read_b64
+define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i32:
+; GCN-DAG: ds_read_b64
+; GCN-DAG: ds_read_b32
+define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
+define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+ %ld = load i32, i32 addrspace(3)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
+define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+ %ld = load i32, i32 addrspace(3)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
+define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
+define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
+define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
+define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
+define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
+define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
+define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
+define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
+define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
+define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
+define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
+define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i64.ll b/test/CodeGen/AMDGPU/load-local-i64.ll
new file mode 100644
index 000000000000..180807df7b9a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i64:
+; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
+ %ld = load i64, i64 addrspace(3)* %in
+ store i64 %ld, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i64:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i64:
+; GCN-DAG: ds_read2_b64
+; GCN-DAG: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
new file mode 100644
index 000000000000..be865b078d74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -0,0 +1,556 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_load_i8:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u8
+
+; EG: LDS_UBYTE_READ_RET
+define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(3)* %in
+ store i8 %ld, i8 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i8:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u16
+
+; EG: LDS_USHORT_READ_RET
+define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i8:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i8:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i8:
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i8:
+; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset0:1{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u8
+
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_i8
+
+; EG: LDS_UBYTE_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %ld = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
+define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
+define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
+; GCN: ds_read_u16
+
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u16
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
+; GCN: ds_read_b32
+
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
+define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+
+; GCN-DAG: ds_write_b64
+; GCN-DAG: ds_write_b32
+
+define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
+define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
+define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
+define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
+define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
+define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
+define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
+define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
+define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
+; GCN: ds_read_i8 v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
+define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
+define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
+define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
+define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
+define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
+define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
+define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
+define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
+define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
+define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
+define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
+define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
+; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
+; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
+; GCN: ds_read_u8 v[[VAL:[0-9]+]],
+; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
+define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
+; GCN: ds_read_i8 v[[VAL:[0-9]+]],
+; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
+define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
+define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
+define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
+define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
+define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
+define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
+define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
+define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
+define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
+define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
+define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
+define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
+define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
+; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
+; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
new file mode 100644
index 000000000000..b9f7018b8107
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_i24:
+; SI: {{flat|buffer}}_load_ubyte
+; SI: {{flat|buffer}}_load_ushort
+; SI: {{flat|buffer}}_store_dword
+define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+ %1 = load i24, i24 addrspace(1)* %in
+ %2 = zext i24 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i25:
+; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOHSA: buffer_store_dword [[VAL]]
+
+; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
+; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
+define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+ %1 = load i25, i25 addrspace(1)* %in
+ %2 = zext i25 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll
deleted file mode 100644
index 6486c6ab2ffc..000000000000
--- a/test/CodeGen/AMDGPU/load.ll
+++ /dev/null
@@ -1,737 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
-
-;===------------------------------------------------------------------------===;
-; GLOBAL ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load an i8 value from the global address space.
-; FUNC-LABEL: {{^}}load_i8:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte
-define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
- %1 = load i8, i8 addrspace(1)* %in
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i8_sext:
-; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 8
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
-entry:
- %0 = load i8, i8 addrspace(1)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8:
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %1 = zext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_sext:
-; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: 8
-; R600-DAG: 8
-
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %1 = sext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8:
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %1 = zext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_sext:
-; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
-; R600-DAG: 8
-; R600-DAG: 8
-; R600-DAG: 8
-; R600-DAG: 8
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %1 = sext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; Load an i16 value from the global address space.
-; FUNC-LABEL: {{^}}load_i16:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
-entry:
- %0 = load i16 , i16 addrspace(1)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i16_sext:
-; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 16
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
-entry:
- %0 = load i16, i16 addrspace(1)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16:
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %1 = zext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_sext:
-; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: 16
-; R600-DAG: 16
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %1 = sext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16:
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %1 = zext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_sext:
-; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
-; R600-DAG: 16
-; R600-DAG: 16
-; R600-DAG: 16
-; R600-DAG: 16
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %1 = sext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; load an i32 value from the global address space.
-; FUNC-LABEL: {{^}}load_i32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
-; CI-HSA: flat_load_dword
-define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; load a f32 value from the global address space.
-; FUNC-LABEL: {{^}}load_f32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
-; CI-HSA: flat_load_dword
-define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-entry:
- %0 = load float, float addrspace(1)* %in
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; load a v2f32 value from the global address space
-; FUNC-LABEL: {{^}}load_v2f32:
-; R600: MEM_RAT
-; R600: VTX_READ_64
-; SI-NOHSA: buffer_load_dwordx2
-; CI-HSA: flat_load_dwordx2
-define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
-entry:
- %0 = load <2 x float>, <2 x float> addrspace(1)* %in
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64:
-; R600: VTX_READ_64
-; SI-NOHSA: buffer_load_dwordx2
-; CI-HSA: flat_load_dwordx2
-define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-entry:
- %0 = load i64, i64 addrspace(1)* %in
- store i64 %0, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64_sext:
-; R600: MEM_RAT
-; R600: MEM_RAT
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
-; R600: 31
-; SI-NOHSA: buffer_load_dword
-; CI-HSA: flat_load_dword
-
-define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- %1 = sext i32 %0 to i64
- store i64 %1, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64_zext:
-; R600: MEM_RAT
-; R600: MEM_RAT
-define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- %1 = zext i32 %0 to i64
- store i64 %1, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v8i32:
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
-entry:
- %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
- store <8 x i32> %0, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v16i32:
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
-entry:
- %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
- store <16 x i32> %0, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-;===------------------------------------------------------------------------===;
-; CONSTANT ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load a sign-extended i8 value
-; FUNC-LABEL: {{^}}load_const_i8_sext:
-; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 8
-; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}},
-; CI-HSA: flat_load_sbyte v{{[0-9]+}},
-define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = load i8, i8 addrspace(2)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an aligned i8 value
-; FUNC-LABEL: {{^}}load_const_i8_aligned:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte v{{[0-9]+}},
-define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = load i8, i8 addrspace(2)* %in
- %1 = zext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an un-aligned i8 value
-; FUNC-LABEL: {{^}}load_const_i8_unaligned:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte v{{[0-9]+}},
-define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
- %1 = load i8, i8 addrspace(2)* %0
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; Load a sign-extended i16 value
-; FUNC-LABEL: {{^}}load_const_i16_sext:
-; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 16
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = load i16, i16 addrspace(2)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an aligned i16 value
-; FUNC-LABEL: {{^}}load_const_i16_aligned:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = load i16, i16 addrspace(2)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an un-aligned i16 value
-; FUNC-LABEL: {{^}}load_const_i16_unaligned:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
- %1 = load i16, i16 addrspace(2)* %0
- %2 = zext i16 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an i32 value from the constant address space.
-; FUNC-LABEL: {{^}}load_const_addrspace_i32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI: s_load_dword s{{[0-9]+}}
-define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
-entry:
- %0 = load i32, i32 addrspace(2)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; Load a f32 value from the constant address space.
-; FUNC-LABEL: {{^}}load_const_addrspace_f32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI: s_load_dword s{{[0-9]+}}
-define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
- %1 = load float, float addrspace(2)* %in
- store float %1, float addrspace(1)* %out
- ret void
-}
-
-;===------------------------------------------------------------------------===;
-; LOCAL ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load an i8 value from the local address space.
-; FUNC-LABEL: {{^}}load_i8_local:
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
- %1 = load i8, i8 addrspace(3)* %in
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i8_sext_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
-entry:
- %0 = load i8, i8 addrspace(3)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-; SI: ds_read_u8
-define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
- %1 = zext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_sext_local:
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-; SI: ds_read_i8
-define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
- %1 = sext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
- %1 = zext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_sext_local:
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-; SI: ds_read_i8
-; SI: ds_read_i8
-; SI: ds_read_i8
-define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
- %1 = sext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; Load an i16 value from the local address space.
-; FUNC-LABEL: {{^}}load_i16_local:
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
-entry:
- %0 = load i16 , i16 addrspace(3)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i16_sext_local:
-; R600: LDS_USHORT_READ_RET
-; R600: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
-entry:
- %0 = load i16, i16 addrspace(3)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_local:
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-; SI: ds_read_u16
-define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
- %1 = zext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_sext_local:
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-; SI: ds_read_i16
-define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
- %1 = sext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_local:
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
- %1 = zext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_sext_local:
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-; SI: ds_read_i16
-; SI: ds_read_i16
-; SI: ds_read_i16
-define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
- %1 = sext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; load an i32 value from the local address space.
-; FUNC-LABEL: {{^}}load_i32_local:
-; R600: LDS_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_b32
-define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
-entry:
- %0 = load i32, i32 addrspace(3)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; load a f32 value from the local address space.
-; FUNC-LABEL: {{^}}load_f32_local:
-; R600: LDS_READ_RET
-; SI: s_mov_b32 m0
-; SI: ds_read_b32
-define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
-entry:
- %0 = load float, float addrspace(3)* %in
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; load a v2f32 value from the local address space
-; FUNC-LABEL: {{^}}load_v2f32_local:
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; SI: s_mov_b32 m0
-; SI: ds_read_b64
-define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
-entry:
- %0 = load <2 x float>, <2 x float> addrspace(3)* %in
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; Test loading a i32 and v2i32 value from the same base pointer.
-; FUNC-LABEL: {{^}}load_i32_v2i32_local:
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; SI-DAG: ds_read_b32
-; SI-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
- %scalar = load i32, i32 addrspace(3)* %in
- %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
- %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
- %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
- %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
- %vec = add <2 x i32> %vec0, %vec1
- store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; On SI we need to make sure that the base offset is a register and not
-; an immediate.
-; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; SI: ds_read_b32 v0, v[[ZERO]] offset:4
-; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
-entry:
- %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
- %tmp1 = load i32, i32 addrspace(3)* %tmp0
- %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp1, i32 addrspace(1)* %tmp2
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load.vec.ll b/test/CodeGen/AMDGPU/load.vec.ll
deleted file mode 100644
index 02f883cd8e9c..000000000000
--- a/test/CodeGen/AMDGPU/load.vec.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; load a v2i32 value from the global address space.
-; EG: {{^}}load_v2i32:
-; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
-; SI: {{^}}load_v2i32:
-; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
- %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
- store <2 x i32> %a, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; load a v4i32 value from the global address space.
-; EG: {{^}}load_v4i32:
-; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
-; SI: {{^}}load_v4i32:
-; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
-define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
- store <4 x i32> %a, <4 x i32> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load64.ll b/test/CodeGen/AMDGPU/load64.ll
deleted file mode 100644
index 74beabdc0076..000000000000
--- a/test/CodeGen/AMDGPU/load64.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-; load a f64 value from the global address space.
-; CHECK-LABEL: {{^}}load_f64:
-; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %1 = load double, double addrspace(1)* %in
- store double %1, double addrspace(1)* %out
- ret void
-}
-
-; CHECK-LABEL: {{^}}load_i64:
-; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tmp = load i64, i64 addrspace(1)* %in
- store i64 %tmp, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; Load a f64 value from the constant address space.
-; CHECK-LABEL: {{^}}load_const_addrspace_f64:
-; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
- %1 = load double, double addrspace(2)* %in
- store double %1, double addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index 33f3159d13eb..f63d6e08ef73 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -122,8 +122,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v2i64_store:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14
; BOTH: s_endpgm
define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
@@ -133,8 +132,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
; BOTH: s_endpgm
define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -143,10 +141,8 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v4i64_store:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28
; BOTH: s_endpgm
define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
@@ -156,10 +152,8 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
; BOTH: s_endpgm
define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
index 2aaf977ab903..ce82ff5475bc 100644
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
@@ -31,8 +31,8 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
@@ -68,35 +68,35 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_offset:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_bad_si_offset:
; EG: LDS_ADD_RET *
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -126,23 +126,23 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32_offset:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -324,7 +324,6 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
ret void
}
-; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
; GCN: s_load_dword [[SPTR:s[0-9]+]],
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
@@ -357,30 +356,30 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]]
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
-; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_bad_si_offset:
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -405,20 +404,20 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index 0ffa5e751b7d..34be6511a602 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -10,7 +10,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -20,7 +20,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
; GCN: ds_add_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -29,11 +29,11 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
@@ -45,29 +45,29 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
-; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
+; GCN: ds_add_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
; GCN: ds_sub_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -76,7 +76,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -86,29 +86,29 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
-; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
+; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
; GCN: ds_and_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -117,7 +117,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
; GCN: ds_and_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -127,7 +127,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
; GCN: ds_or_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -136,7 +136,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
; GCN: ds_or_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -146,7 +146,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
; GCN: ds_xor_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -155,7 +155,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
; GCN: ds_xor_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -166,14 +166,14 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
}
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
-; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
+; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:
; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
; GCN: ds_min_rtn_i64
; GCN: s_endpgm
define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -182,7 +182,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
; GCN: ds_min_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -192,7 +192,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
; GCN: ds_max_rtn_i64
; GCN: s_endpgm
define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -201,7 +201,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
; GCN: ds_max_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -211,7 +211,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
; GCN: ds_min_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -220,7 +220,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
; GCN: ds_min_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -230,7 +230,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
; GCN: ds_max_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -239,7 +239,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
; GCN: ds_max_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -249,7 +249,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -257,7 +257,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -266,7 +266,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
; GCN: ds_add_u64
; GCN: s_endpgm
define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -274,7 +274,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
@@ -288,26 +288,26 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
-; GCN: ds_inc_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
+; GCN: ds_add_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
; GCN: ds_sub_u64
; GCN: s_endpgm
define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -315,7 +315,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -324,26 +324,26 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
-; GCN: ds_dec_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
+; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
; GCN: ds_and_b64
; GCN: s_endpgm
define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -351,7 +351,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
; GCN: ds_and_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -360,7 +360,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
; GCN: ds_or_b64
; GCN: s_endpgm
define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -368,7 +368,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
; GCN: ds_or_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -377,7 +377,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
; GCN: ds_xor_b64
; GCN: s_endpgm
define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -385,7 +385,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
; GCN: ds_xor_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -395,13 +395,13 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this.
+; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this.
; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
; ret void
; }
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
; GCN: ds_min_i64
; GCN: s_endpgm
define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -409,7 +409,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
; GCN: ds_min_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -418,7 +418,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
; GCN: ds_max_i64
; GCN: s_endpgm
define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -426,7 +426,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
; GCN: ds_max_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -435,7 +435,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
; GCN: ds_min_u64
; GCN: s_endpgm
define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -443,7 +443,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
; GCN: ds_min_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -452,7 +452,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
; GCN: ds_max_u64
; GCN: s_endpgm
define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -460,7 +460,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
; GCN: ds_max_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
deleted file mode 100644
index 6b52b80ba082..000000000000
--- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
-
-@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-
-
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 8
-; GCN: .long 47180
-; GCN-NEXT: .long 32900
-
-; EG: {{^}}local_memory_two_objects:
-
-; We would like to check the lds writes are using different
-; addresses, but due to variations in the scheduler, we can't do
-; this consistently on evergreen GPUs.
-; EG: LDS_WRITE
-; EG: LDS_WRITE
-; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-
-; Make sure the lds reads are using different addresses, at different
-; constant offsets.
-; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
-
-define void @local_memory_two_objects(i32 addrspace(1)* %out) {
-entry:
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
- store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
- %mul = shl nsw i32 %x.i, 1
- %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
- store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
- %sub = sub nsw i32 3, %x.i
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
- %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
- store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
- %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
- %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
- %add = add nsw i32 %x.i, 4
- %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
- store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
- ret void
-}
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
new file mode 100644
index 000000000000..f6c0e3c62390
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; SI: .long 47180
+; SI-NEXT: .long 65668
+; CI: .long 47180
+; CI-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory:
+
+; GCN-NOT: s_wqm_b64
+; GCN: ds_write_b32
+
+; GCN: s_barrier
+
+; GCN: ds_read_b32 {{v[0-9]+}},
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+ %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+ %add = add nsw i32 %y.i, 1
+ %cmp = icmp eq i32 %add, 16
+ %.add = select i1 %cmp, i32 0, i32 %add
+ call void @llvm.amdgcn.s.barrier()
+ %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory_two_objects:
+; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+
+; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
+
+; SI-DAG: ds_write_b32 [[ADDRW]],
+; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+
+; GCN: s_barrier
+
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
+
+; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+entry:
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x()
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.amdgcn.s.barrier()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
index 9ffb59e70920..1a11332f865d 100644
--- a/test/CodeGen/AMDGPU/local-memory.ll
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -1,49 +1,44 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+@lds = addrspace(3) global [512 x i32] undef, align 4
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 128
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
+; On SI we need to make sure that the base offset is a register and
+; not an immediate.
-; FUNC-LABEL: {{^}}local_memory:
+; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
-; EG: LDS_WRITE
-; SI-NOT: s_wqm_b64
-; SI: ds_write_b32
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-; SI: s_barrier
-
-; EG: LDS_READ_RET
-; SI: ds_read_b32 {{v[0-9]+}},
-
-define void @local_memory(i32 addrspace(1)* %out) {
+; R600: LDS_READ_RET
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
entry:
- %y.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
- store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
- %add = add nsw i32 %y.i, 1
- %cmp = icmp eq i32 %add, 16
- %.add = select i1 %cmp, i32 0, i32 %add
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
- %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
- store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+ %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+ %tmp1 = load i32, i32 addrspace(3)* %tmp0
+ %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp1, i32 addrspace(1)* %tmp2
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
+; Test loading an i32 and a v2i32 value from the same base pointer.
+; FUNC-LABEL: {{^}}load_i32_v2i32_local:
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read2_b32
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+ %scalar = load i32, i32 addrspace(3)* %in
+ %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
+ %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
+ %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
+ %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
+ %vec = add <2 x i32> %vec0, %vec1
+ store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+ ret void
+}
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/local-memory.r600.ll b/test/CodeGen/AMDGPU/local-memory.r600.ll
new file mode 100644
index 000000000000..9841b8882b39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -0,0 +1,87 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 128
+
+; FUNC-LABEL: {{^}}local_memory:
+
+; EG: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; EG: LDS_READ_RET
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+ %y.i = call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+ %add = add nsw i32 %y.i, 1
+ %cmp = icmp eq i32 %add, 16
+ %.add = select i1 %cmp, i32 0, i32 %add
+ call void @llvm.r600.group.barrier()
+ %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; FUNC-LABEL: {{^}}local_memory_two_objects:
+
+; We would like to check the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+entry:
+ %x.i = call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.r600.group.barrier()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare void @llvm.r600.group.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
new file mode 100644
index 000000000000..6e6f289f5d6d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+
+; This used to fail due to a v_add_i32 instruction with an illegal immediate
+; operand that was created during Local Stack Slot Allocation. Test case derived
+; from https://bugs.freedesktop.org/show_bug.cgi?id=96602
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
+; CHECK: v_mov_b32_e32 [[HI_CONST:v[0-9]+]], 0x200
+; CHECK: v_mov_b32_e32 [[LO_CONST:v[0-9]+]], 0
+; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, [[BYTES]], [[HI_CONST]]
+; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, [[BYTES]], [[LO_CONST]]
+; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+define amdgpu_ps float @main(i32 %idx) {
+main_body:
+ %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+ %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
+ %r = fadd float %v1, %v2
+ ret float %r
+}
diff --git a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
new file mode 100644
index 000000000000..e1fad13e0b51
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-unknown < %s | FileCheck %s
+
+; The and mask is redundant given the range metadata and can be eliminated
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
+; CHECK-NOT: v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
+define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+ %and = and i32 %id, 1023
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
+; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+ %and = and i32 %id, 511
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
+; CHECK-NOT: v0
+; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+ %and = and i32 %id, 255
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{i32 0, i32 1024}
+!1 = !{i32 0, i32 1023}
diff --git a/test/CodeGen/AMDGPU/m0-spill.ll b/test/CodeGen/AMDGPU/m0-spill.ll
index 1dddc85f775d..2427c8de34f8 100644
--- a/test/CodeGen/AMDGPU/m0-spill.ll
+++ b/test/CodeGen/AMDGPU/m0-spill.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK-NOT: v_readlane_b32 m0
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
main_body:
%4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
%cmp = fcmp ueq float 0.0, %4
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index c98f851f2b93..0e6281940c24 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -8,7 +8,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
@@ -32,15 +32,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fadd float %mul, %c
@@ -71,7 +71,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -79,17 +79,17 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fadd float %mul, %c
%fma1 = fadd float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -108,15 +108,15 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fadd float %c, %mul
@@ -138,15 +138,15 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fsub float %mul, %c
@@ -175,7 +175,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -183,16 +183,16 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fsub float %mul, %c
%fma1 = fsub float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -210,15 +210,15 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fsub float %c, %mul
@@ -246,7 +246,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -254,16 +254,16 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fsub float %c, %mul
%fma1 = fsub float %d, %mul
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -282,15 +282,15 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
@@ -320,7 +320,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -328,18 +328,18 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
%fma0 = fsub float %mul.neg, %c
%fma1 = fsub float %mul.neg, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -363,7 +363,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -371,18 +371,18 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
%fma0 = fsub float %mul.neg, %c
%fma1 = fsub float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -408,7 +408,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -416,11 +416,11 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
@@ -454,7 +454,7 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -462,11 +462,11 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
@@ -491,8 +491,8 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
@@ -500,7 +500,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -508,11 +508,11 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
@@ -538,15 +538,15 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -554,11 +554,11 @@ define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll
index 24ff23a4cfc1..7fcfe7f53f06 100644
--- a/test/CodeGen/AMDGPU/mad-sub.ll
+++ b/test/CodeGen/AMDGPU/mad-sub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
; FUNC-LABEL: {{^}}mad_sub_f32:
@@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) #0
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -18,9 +18,9 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%mul = fmul float %a, %b
%sub = fsub float %mul, %c
store float %sub, float addrspace(1)* %outgep, align 4
@@ -34,7 +34,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -42,9 +42,9 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%mul = fmul float %a, %b
%sub = fsub float %c, %mul
store float %sub, float addrspace(1)* %outgep, align 4
@@ -55,7 +55,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: v_mul_f64
; SI: v_add_f64
define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -63,9 +63,9 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
- %a = load double, double addrspace(1)* %gep0, align 8
- %b = load double, double addrspace(1)* %gep1, align 8
- %c = load double, double addrspace(1)* %gep2, align 8
+ %a = load volatile double, double addrspace(1)* %gep0, align 8
+ %b = load volatile double, double addrspace(1)* %gep1, align 8
+ %c = load volatile double, double addrspace(1)* %gep2, align 8
%mul = fmul double %a, %b
%sub = fsub double %mul, %c
store double %sub, double addrspace(1)* %outgep, align 8
@@ -79,7 +79,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -87,9 +87,9 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%c.abs = call float @llvm.fabs.f32(float %c) #0
%mul = fmul float %a, %b
%sub = fsub float %mul, %c.abs
@@ -104,7 +104,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -112,9 +112,9 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%c.abs = call float @llvm.fabs.f32(float %c) #0
%mul = fmul float %a, %b
%sub = fsub float %c.abs, %mul
@@ -125,7 +125,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
; FUNC-LABEL: {{^}}neg_neg_mad_f32:
; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -133,9 +133,9 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%nega = fsub float -0.000000e+00, %a
%negb = fsub float -0.000000e+00, %b
%mul = fmul float %nega, %negb
@@ -151,7 +151,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -159,9 +159,9 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%b.abs = call float @llvm.fabs.f32(float %b) #0
%mul = fmul float %a, %b.abs
%sub = fsub float %mul, %c
@@ -175,13 +175,13 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI: buffer_store_dword [[R2]]
define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%add = fadd float %r1, %r1
%r3 = fsub float %r2, %add
@@ -196,13 +196,13 @@ define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in)
; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
; SI: buffer_store_dword [[RESULT]]
define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%add = fadd float %r1, %r1
%r3 = fsub float %add, %r2
diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
new file mode 100644
index 000000000000..9183ae0972dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; If the workgroup id range is restricted to 24 bits (via !range metadata), we
+; should be able to use mad24 for the usual global-id indexing pattern.
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: {{^}}get_global_id_0:
+; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
+; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
+; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
+define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+ %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+ %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
+ %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0
+ %workgroup.size.x = and i32 %workgroup.size.xy, 65535
+
+ %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+ %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x(), !range !2
+
+ %mul = mul i32 %workgroup.id.x, %workgroup.size.x
+ %add = add i32 %mul, %workitem.id.x
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{}
+!1 = !{i32 0, i32 1024}
+!2 = !{i32 0, i32 16777216}
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
index 86d75a63ca40..f177608a62fc 100644
--- a/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,9 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-
-declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
; FUNC-LABEL: {{^}}i32_mad24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
@@ -24,12 +22,3 @@ entry:
store i32 %3, i32 addrspace(1)* %out
ret void
}
-
-; FUNC-LABEL: @test_imul24
-; SI: v_mad_i32_i24
-define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
- %add = add i32 %mul, %src2
- store i32 %add, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 95fe34119596..72c6b2b26173 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}u32_mad24:
; EG: MULADD_UINT24
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 2e90cf10a3b5..6ea1202ac500 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -3,7 +3,7 @@
; FIXME: Enable VI
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madak_f32:
@@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -38,7 +38,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
; GCN: s_endpgm
define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -47,17 +47,17 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
%out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %a = load float, float addrspace(1)* %in.gep.0, align 4
- %b = load float, float addrspace(1)* %in.gep.1, align 4
- %c = load float, float addrspace(1)* %in.gep.2, align 4
+ %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
+ %c = load volatile float, float addrspace(1)* %in.gep.2, align 4
%mul0 = fmul float %a, %b
%mul1 = fmul float %a, %c
%madak0 = fadd float %mul0, 10.0
%madak1 = fadd float %mul1, 10.0
- store float %madak0, float addrspace(1)* %out.gep.0, align 4
- store float %madak1, float addrspace(1)* %out.gep.1, align 4
+ store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
+ store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
ret void
}
@@ -65,7 +65,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -85,7 +85,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -101,13 +101,13 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; We can't use an SGPR when forming madak
; GCN-LABEL: {{^}}s_v_madak_f32:
-; GCN: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -154,7 +154,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GCN: s_endpgm
define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -176,7 +176,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GCN: s_endpgm
define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -191,3 +191,32 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float
store float %madak, float addrspace(1)* %out.gep, align 4
ret void
}
+
+; SIFoldOperands should not fold the SGPR copy into the instruction
+; because the implicit immediate already uses the constant bus.
+; GCN-LABEL: {{^}}madak_constant_bus_violation:
+; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
+; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
+; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
+; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
+; GCN: buffer_store_dword [[MUL]]
+define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+bb:
+ %tmp = icmp eq i32 %arg1, 0
+ br i1 %tmp, label %bb3, label %bb4
+
+bb3:
+ store volatile float 0.0, float addrspace(1)* undef
+ br label %bb4
+
+bb4:
+ %vgpr = load volatile float, float addrspace(1)* undef
+ %tmp0 = fmul float %sgpr0, 0.5
+ %tmp1 = fadd float %tmp0, 42.0
+ %tmp2 = fmul float %tmp1, %vgpr
+ store volatile float %tmp2, float addrspace(1)* undef, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index f8e14e34af67..1adf82402b72 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -1,21 +1,25 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+ ; FIXME: None of these trigger madmk emission anymore. It is still
+ ; possible, but requires the correct registers to be used, which is
+ ; hard to trigger.
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madmk_f32:
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%mul = fmul float %a, 10.0
%madmk = fadd float %mul, %b
@@ -32,7 +36,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
; GCN: s_endpgm
define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -41,9 +45,9 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
%out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %a = load float, float addrspace(1)* %in.gep.0, align 4
- %b = load float, float addrspace(1)* %in.gep.1, align 4
- %c = load float, float addrspace(1)* %in.gep.2, align 4
+ %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
+ %c = load volatile float, float addrspace(1)* %in.gep.2, align 4
%mul0 = fmul float %a, 10.0
%mul1 = fmul float %a, 10.0
@@ -61,13 +65,13 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%mul = fmul float %a, 4.0
%madmk = fadd float %mul, %b
@@ -80,7 +84,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%mul = fmul float %a, 10.0
@@ -94,7 +98,7 @@ define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b)
; GCN: v_mad_f32
; GCN: s_endpgm
define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
@@ -110,7 +114,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%b = load float, float addrspace(1)* %gep.0, align 4
@@ -126,13 +130,13 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
@@ -147,13 +151,13 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
@@ -168,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -182,7 +186,7 @@ define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float ad
; SI-LABEL: {{^}}kill_madmk_verifier_error:
; SI: s_xor_b64
-; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c
+; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
; SI: s_or_b64
define void @kill_madmk_verifier_error() nounwind {
bb:
@@ -193,7 +197,9 @@ bb1: ; preds = %bb2
bb2: ; preds = %bb6, %bb
%tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ]
- %tmp3 = fsub float undef, %tmp
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+ %f_tid = bitcast i32 %tid to float
+ %tmp3 = fsub float %f_tid, %tmp
%tmp5 = fcmp oeq float %tmp3, 1.000000e+04
br i1 %tmp5, label %bb1, label %bb6
@@ -203,3 +209,7 @@ bb6: ; preds = %bb2
%tmp8 = fadd float %tmp7, undef
br label %bb2
}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/max-literals.ll b/test/CodeGen/AMDGPU/max-literals.ll
index c357524b140f..3f80d5e41a3f 100644
--- a/test/CodeGen/AMDGPU/max-literals.ll
+++ b/test/CodeGen/AMDGPU/max-literals.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK: ADD *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -23,16 +23,16 @@ main_body:
%15 = insertelement <4 x float> %14, float %8, i32 3
%16 = insertelement <4 x float> %15, float %11, i32 3
- %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
%18 = insertelement <4 x float> undef, float %17, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 2)
ret void
}
; CHECK-LABEL: {{^}}main2:
; CHECK-NOT: ADD *
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -52,16 +52,15 @@ main_body:
%15 = insertelement <4 x float> %14, float %8, i32 3
%16 = insertelement <4 x float> %15, float %11, i32 3
- %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
%18 = insertelement <4 x float> undef, float %17, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index eeb915c10a96..5fa307be0fd5 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,19 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -22,21 +20,25 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: v_max_i32_e32
; SI: v_max_i32_e32
; SI: v_max_i32_e32
+
+; These could be merged into one
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %out, i32 %tid
- %a = load <4 x i32>, <4 x i32> addrspace(1)* %gep0, align 4
- %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 4
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
%cmp = icmp sge <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
- store <4 x i32> %val, <4 x i32> addrspace(1)* %outgep, align 4
+ store <4 x i32> %val, <4 x i32> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imax_sge_i32
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -46,6 +48,8 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sge i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
@@ -57,21 +61,21 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI: buffer_load_sbyte
; SI: buffer_load_sbyte
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp sge i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sgt i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
@@ -82,29 +86,33 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
ret void
}
+
; FUNC-LABEL: @v_test_imax_sgt_i32
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sgt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imax_sgt_i32
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sgt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -114,21 +122,21 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: @v_test_umax_uge_i32
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp uge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_umax_uge_i32
; SI: s_max_u32
+
+; EG: MAX_UINT
define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp uge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -142,6 +150,11 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI: s_max_u32
; SI-NOT: s_max_u32
; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+; EG: MAX_UINT
+; EG-NOT: MAX_UINT
define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
%cmp = icmp uge <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
@@ -153,36 +166,34 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp uge i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: @v_test_umax_ugt_i32
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ugt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}s_test_umax_ugt_i32:
; SI: s_max_u32
+
+; EG: MAX_UINT
define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ugt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -191,8 +202,11 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+
+; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
+; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
@@ -205,8 +219,10 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
+
+; EG: MAX_UINT
define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@@ -223,8 +239,10 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
+
+; EG: MAX_INT
define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
@@ -242,9 +260,60 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16
; SI: s_sext_i32_i16
; SI: s_sext_i32_i16
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
%cmp = icmp sge i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
ret void
}
+
+; 64 bit
+; FUNC-LABEL: {{^}}test_umax_ugt_i64
+; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ugt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_umax_uge_i64
+; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp uge i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imax_sgt_i64
+; SI: s_endpgm
+
+; EG-DAG: MAX_UINT
+; EG-DAG: MAX_INT
+define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sgt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imax_sge_i64
+; SI: s_endpgm
+
+; EG-DAG: MAX_UINT
+; EG-DAG: MAX_INT
+define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sge i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index cfb94b272e51..a12dba2eb6e9 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imax3_sgt_i32
; SI: v_max3_i32
define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umax3_ugt_i32
; SI: v_max3_u32
define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index 65b454b5d8cb..17b4af818f8f 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -231,8 +231,8 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32
}
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
-; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
@@ -334,8 +334,8 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f
}
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
-; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
@@ -376,7 +376,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %
%w = load i32, i32 addrspace(1)* %in.gep.3
; Make sure the barrier doesn't stop this
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out.gep.3
store i32 %z, i32 addrspace(1)* %out.gep.2
@@ -413,7 +413,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
%w = load i32, i32 addrspace(1)* %in.gep.3
; Make sure the barrier doesn't stop this
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out
store i32 %z, i32 addrspace(1)* %out.gep.1
@@ -640,13 +640,13 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -657,13 +657,13 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
@@ -673,13 +673,13 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
@@ -690,13 +690,13 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
@@ -705,7 +705,7 @@ define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x dou
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
+declare void @llvm.amdgcn.s.barrier() #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 215dbeb4b2fd..5d64a152af3c 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,24 +1,25 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; SI: v_min_i32_e32
+
+; EG: MIN_INT
define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -28,6 +29,8 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -40,6 +43,11 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
@@ -79,6 +87,11 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
; SI: v_min_i32
; SI: s_endpgm
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -91,6 +104,11 @@ define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -100,21 +118,21 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
; FUNC-LABEL: @v_test_imin_slt_i32
; SI: v_min_i32_e32
+
+; EG: MIN_INT
define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imin_slt_i32
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -125,6 +143,9 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
; SI: s_min_i32
; SI: s_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
@@ -134,6 +155,8 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+
+; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp slt i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
@@ -143,6 +166,8 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+
+; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sle i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
@@ -152,16 +177,14 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: @v_test_umin_ule_i32
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -171,20 +194,22 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: v_min_u32_e32
; SI-NOT: v_min_u32_e32
; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
- %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep0
- %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep1
+ %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr
+ %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr
%cmp = icmp ule <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
- store <3 x i32> %val, <3 x i32> addrspace(1)* %outgep
+ store <3 x i32> %val, <3 x i32> addrspace(1)* %out
ret void
}
; FUNC-LABEL: @s_test_umin_ule_i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -194,16 +219,14 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: @v_test_umin_ult_i32
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -211,21 +234,21 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp ult i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: @s_test_umin_ult_i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -239,24 +262,23 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI-NEXT: v_cndmask_b32
; SI-NOT: v_min
; SI: s_endpgm
+
+; EG-NOT: MIN_UINT
define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid
- %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep0, align 4
- store i1 %cmp, i1 addrspace(1)* %outgep1
+ store i32 %val, i32 addrspace(1)* %out0, align 4
+ store i1 %cmp, i1 addrspace(1)* %out1
ret void
}
; FUNC-LABEL: @s_test_umin_ult_v1i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%cmp = icmp ult <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -273,6 +295,15 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
%cmp = icmp ult <8 x i32> %a, %b
%val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
@@ -289,6 +320,15 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
; SI: v_min_u32
; SI: v_min_u32
; SI: v_min_u32
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
%cmp = icmp ult <8 x i16> %a, %b
%val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
@@ -301,8 +341,10 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
+
+; EG: MIN_UINT
define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@@ -319,8 +361,10 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
+
+; EG: MIN_INT
define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
@@ -334,9 +378,60 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
%cmp = icmp sle i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
ret void
}
+
+; 64 bit
+; FUNC-LABEL: {{^}}test_umin_ult_i64
+; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ult i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_umin_ule_i64
+; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ule i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imin_slt_i64
+; SI: s_endpgm
+
+; EG-DAG: MIN_UINT
+; EG-DAG: MIN_INT
+define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp slt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imin_sle_i64
+; SI: s_endpgm
+
+; EG-DAG: MIN_UINT
+; EG-DAG: MIN_INT
+define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sle i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
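Editorial aside, not part of the patch: the mixed EG-DAG expectations above (MIN_UINT together with MIN_INT for the signed i64 cases, two MIN_UINT for the unsigned ones) are consistent with a word-wise expansion, in which the high 32 bits are compared signed and the low 32 bits unsigned. A hand-written sketch of that decomposition for signed min, purely for illustration and not the backend's actual lowering:

; Hypothetical illustration only -- not a test from this patch.
define i64 @smin64_wordwise(i64 %a, i64 %b) {
entry:
  %a.hi.shift = lshr i64 %a, 32
  %a.hi = trunc i64 %a.hi.shift to i32
  %b.hi.shift = lshr i64 %b, 32
  %b.hi = trunc i64 %b.hi.shift to i32
  %a.lo = trunc i64 %a to i32
  %b.lo = trunc i64 %b to i32
  %hi.lt = icmp slt i32 %a.hi, %b.hi       ; high words carry the sign: compare signed
  %hi.eq = icmp eq i32 %a.hi, %b.hi
  %lo.lt = icmp ult i32 %a.lo, %b.lo       ; low words compare unsigned
  %eq.lo = and i1 %hi.eq, %lo.lt
  %lt = or i1 %hi.lt, %eq.lo
  %min = select i1 %lt, i64 %a, i64 %b
  ret i64 %min
}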
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 38ef46d1bdd6..728479ad9f62 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imin3_slt_i32
; SI: v_min3_i32
define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umin3_ult_i32
; SI: v_min3_u32
define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -44,7 +44,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; SI: v_min_i32
; SI: v_min3_i32
define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -78,7 +78,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap
; FUNC-LABEL: @v_test_umin3_2_uses
; SI-NOT: v_min3
define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
index 4af9cdf1b960..3d6d7fae0fd6 100644
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -7,8 +7,12 @@
; FUNC-LABEL: {{^}}missing_store_reduced:
; SI: ds_read_b64
-; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI-DAG: buffer_store_dword
+; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword
+; SI: s_nop 2
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; SI: buffer_store_dword
; SI: s_endpgm
define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index e9f641b736d5..36f12573c173 100644
--- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -6,12 +6,13 @@
; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
-; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
-; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
; GCN-NOT: v_mov_b32
; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 8bca0575ecd2..1a0a39027853 100644
--- a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -7,12 +7,12 @@
; Check that moving the pointer out of the resource descriptor to
; vaddr works for atomics.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}atomic_max_i32:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
%xor = xor i32 %tid, 1
@@ -32,7 +32,7 @@ exit:
; GCN-LABEL: {{^}}atomic_max_i32_noret:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
%xor = xor i32 %tid, 1
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index b19163f294e0..a574365da986 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
;;;==========================================================================;;;
;;; MUBUF LOAD TESTS
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone
; MUBUF load with an immediate byte offset that fits into 12 bits
; CHECK-LABEL: {{^}}mubuf_load0:
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
@@ -19,7 +19,7 @@ entry:
; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_load1:
-; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
+; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -31,7 +31,7 @@ entry:
; MUBUF load with an immediate byte offset that doesn't fit into 12 bits
; CHECK-LABEL: {{^}}mubuf_load2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
@@ -55,15 +55,14 @@ entry:
; CHECK-LABEL: {{^}}soffset_max_imm:
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
-define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_gs void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
%tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
%tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
%tmp2 = shl i32 %6, 2
%tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
%tmp4 = add i32 %6, 16
- %tmp5 = bitcast float 0.0 to i32
- call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
ret void
}
@@ -74,15 +73,14 @@ main_body:
; CHECK-LABEL: {{^}}soffset_no_fold:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
-define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_gs void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
%tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
%tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
%tmp2 = shl i32 %6, 2
%tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
%tmp4 = add i32 %6, 16
- %tmp5 = bitcast float 0.0 to i32
- call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
ret void
}
@@ -92,7 +90,7 @@ main_body:
; MUBUF store with an immediate byte offset that fits into 12 bits
; CHECK-LABEL: {{^}}mubuf_store0:
-; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
define void @mubuf_store0(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
@@ -102,7 +100,7 @@ entry:
; MUBUF store with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_store1:
-; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
+; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
define void @mubuf_store1(i8 addrspace(1)* %out) {
entry:
@@ -114,7 +112,7 @@ entry:
; MUBUF store with an immediate byte offset that doesn't fit into 12 bits
; CHECK-LABEL: {{^}}mubuf_store2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
-; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
define void @mubuf_store2(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
@@ -135,14 +133,14 @@ entry:
}
; CHECK-LABEL: {{^}}store_sgpr_ptr:
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
store i32 99, i32 addrspace(1)* %out, align 4
ret void
}
; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
store i32 99, i32 addrspace(1)* %out.gep, align 4
@@ -151,7 +149,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
store i32 99, i32 addrspace(1)* %out.gep, align 4
@@ -160,7 +158,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
-; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
@@ -170,14 +168,13 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_vgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
}
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
-attributes #3 = { nounwind readonly }
+attributes #0 = { nounwind readonly }
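Editorial note, not part of the patch: the offsets in these mubuf tests are chosen around the 12-bit unsigned byte-offset field of the MUBUF encoding, so 4095 is the largest value that folds into the instruction, while 4096 (i32 element index 1024, i.e. 0x1000 bytes) has to be materialized into the soffset register with s_movk_i32, as mubuf_load2/mubuf_store2 check. A minimal sketch of the boundary case that still folds, using the same typed-pointer IR style as the tests:

; Hypothetical illustration only -- not a test from this patch.
define void @mubuf_offset_boundary(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  ; element index 1023 -> byte offset 4092, which still fits in the 12-bit field
  %gep = getelementptr i32, i32 addrspace(1)* %in, i64 1023
  %val = load i32, i32 addrspace(1)* %gep, align 4
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}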
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 94e0f96b323e..5ceef7fda826 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; mul24 and mad24 are affected
@@ -96,8 +96,8 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
}
; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
-; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; SI: s_endpgm
define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
index 7609dcc87afa..1a323fbaa1a3 100644
--- a/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}i32_mul24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll
index 8a0e71d739be..fdd348403edf 100644
--- a/test/CodeGen/AMDGPU/mul_uint24.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}u32_mul24:
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
new file mode 100644
index 000000000000..e1130c9125e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}define amdgpu_vs void @main
+; CHECK: main_body:
+; CHECK: LOOP.outer:
+; CHECK: LOOP:
+; CHECK: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
+; CHECK: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
+;
+; CHECK: Flow:
+;
+; Ensure two else.break calls, for both the inner and outer loops
+;
+; CHECK: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; CHECK-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; CHECK-NEXT: call void @llvm.amdgcn.end.cf
+;
+; CHECK: Flow1:
+define amdgpu_vs void @main(<4 x float> %vec, i32 %ub, i32 %cont) {
+main_body:
+ br label %LOOP.outer
+
+LOOP.outer: ; preds = %ENDIF, %main_body
+ %tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %LOOP.outer
+ %tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
+ %tmp47 = add i32 %tmp45, 1
+ %tmp48 = icmp slt i32 %tmp45, %ub
+ br i1 %tmp48, label %ENDIF, label %IF
+
+IF: ; preds = %LOOP
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %tmp51 = icmp eq i32 %tmp47, %cont
+ br i1 %tmp51, label %LOOP, label %LOOP.outer
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
index 73a146710a9f..d1fe794b93fb 100644
--- a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
+++ b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
@@ -1,18 +1,16 @@
; RUN: not llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported non-compute shaders with HSA in pixel_shader
-define void @pixel_shader() #0 {
+; CHECK: in function pixel_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_ps void @pixel_shader() #0 {
ret void
}
-define void @vertex_shader() #1 {
+; CHECK: in function vertex_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_vs void @vertex_shader() #0 {
ret void
}
-define void @geometry_shader() #2 {
+; CHECK: in function geometry_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_gs void @geometry_shader() #0 {
ret void
}
-
-attributes #0 = { nounwind "ShaderType"="0" }
-attributes #1 = { nounwind "ShaderType"="1" }
-attributes #2 = { nounwind "ShaderType"="2" }
diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index 9a814b579deb..9dd99efd997c 100644
--- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -1,19 +1,24 @@
-; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s
-; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s
-; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
+; RUN: llc -march=amdgcn -mcpu=SI -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=tonga -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=r600 -mcpu=cypress -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=EG
+; GCN: R_AMDGPU_REL32 extern_const_addrspace
+; EG: R_AMDGPU_ABS32 extern_const_addrspace
+
+; CHECK-DAG: Name: extern_const_addrspace
@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
-; FUNC-LABEL: {{^}}load_extern_const_init:
+; CHECK-DAG: Name: load_extern_const_init
define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
+; CHECK-DAG: Name: undef_const_addrspace
@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
-; FUNC-LABEL: {{^}}load_undef_const_init:
+; CHECK-DAG: Name: load_undef_const_init
define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index f81911aafe22..fd66b0b5d1f6 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Make sure we don't turn the 32-bit argument load into a 16-bit
; load. There aren't extending scalar loads, so that would require
@@ -22,7 +22,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw
; SI: buffer_load_dword v
; SI: buffer_store_short v
define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -44,7 +44,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -66,7 +66,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -88,7 +88,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -111,7 +111,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -135,7 +135,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin
; SI: buffer_load_ubyte v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i16, i16 addrspace(1)* %gep.in
@@ -158,7 +158,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -181,7 +181,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -201,3 +201,15 @@ entry:
store i32 %mask, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %bc = bitcast <2 x i32> %ld to i64
+ %hi = lshr i64 %bc, 32
+ %trunc = trunc i64 %hi to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
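Editorial note, not part of the patch: on a little-endian target like amdgcn, the lshr-by-32 plus trunc in extract_hi_i64_bitcast_v2i32 reads exactly element 1 of the original <2 x i32>, which is why the checks only expect a single dword load and store rather than a 64-bit load followed by a shift. A hedged sketch of that equivalent form:

; Hypothetical illustration only -- not a test from this patch.
define void @extract_hi_as_extractelement(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
  ; same value as bitcasting to i64, shifting right by 32, and truncating
  %hi = extractelement <2 x i32> %ld, i32 1
  store i32 %hi, i32 addrspace(1)* %out
  ret void
}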
diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index bc467e47dc31..0242f6d6145a 100644
--- a/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -1,5 +1,5 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; Make sure the OpenCL Image lowering pass doesn't crash when argument metadata
; is not in the expected order.
diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll
index 9e514ef9970a..d6fc65fa7e83 100644
--- a/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
@@ -8,7 +8,7 @@ entry:
br i1 %tmp0, label %if, label %endif
if:
- %id = call i32 @llvm.r600.read.tidig.x()
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%offset = add i32 %fold, %id
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
store i32 0, i32 addrspace(1)* %tmp1
@@ -27,7 +27,7 @@ entry:
br i1 %tmp0, label %if, label %endif
if:
- %id = call i32 @llvm.r600.read.tidig.x()
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%val = or i32 %id, %fold
store i32 %val, i32 addrspace(1)* %out
br label %endif
@@ -63,7 +63,7 @@ entry:
define void @vector_inline(<4 x i32> addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
%tmp2 = add i32 %tmp0, 2
%tmp3 = add i32 %tmp0, 3
@@ -82,7 +82,7 @@ entry:
define void @imm_one_use(i32 addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = xor i32 %tmp0, 100
store i32 %tmp1, i32 addrspace(1)* %out
ret void
@@ -96,7 +96,7 @@ entry:
define void @vector_imm(<4 x i32> addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
%tmp2 = add i32 %tmp0, 2
%tmp3 = add i32 %tmp0, 3
@@ -109,5 +109,6 @@ entry:
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
-attributes #0 = { readnone }
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index e40f18f040b7..9b90ff798ca7 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -96,7 +96,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; SI: v_or_b32_e32 v{{[0-9]}}
define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
- %loadb = load i64, i64 addrspace(1)* %a, align 8
+ %loadb = load i64, i64 addrspace(1)* %b, align 8
%or = or i64 %loada, %loadb
store i64 %or, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll
new file mode 100644
index 000000000000..32ad9aba04ed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/over-max-lds-size.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
+
+@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
+
+define void @use_huge_lds() {
+entry:
+ %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
+ store i32 0, i32 addrspace(3)* %v0
+ ret void
+}
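Editorial note, not part of the patch: the figure in the diagnostic comes from 100000 x i32 = 400,000 bytes of addrspace(3) (local/LDS) memory, well beyond the few tens of kilobytes of LDS these targets expose to a workgroup, hence the expected error on all three RUN lines. A minimal contrasting sketch that stays comfortably under any such limit, assuming a small 4 KiB allocation:

; Hypothetical illustration only -- not a test from this patch.
@small = internal unnamed_addr addrspace(3) global [1024 x i32] undef, align 4  ; 4096 bytes of LDS

define void @use_small_lds() {
entry:
  %p = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(3)* @small, i32 0, i32 0
  store i32 0, i32 addrspace(3)* %p
  ret void
}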
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index f32b044198ab..ea943a533c81 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,5 +1,4 @@
-; Function Attrs: nounwind
-; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s
;
; CFG flattening should use parallel-and mode to generate branch conditions and
; then merge if-regions with the same bodies.
diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 51985af42a29..3e0d36978ad4 100644
--- a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -8,10 +8,10 @@
; During live interval construction, the first sub register def is
; incorrectly marked as dead.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
diff --git a/test/CodeGen/AMDGPU/predicate-dp4.ll b/test/CodeGen/AMDGPU/predicate-dp4.ll
index 6bc187594359..7ac2bb7b0c7f 100644
--- a/test/CodeGen/AMDGPU/predicate-dp4.ll
+++ b/test/CodeGen/AMDGPU/predicate-dp4.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK: PRED_SETE_INT * Pred,
; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
-define void @main(<4 x float> inreg) #0 {
+define amdgpu_ps void @main(<4 x float> inreg) {
main_body:
%1 = extractelement <4 x float> %0, i32 0
%2 = bitcast float %1 to i32
@@ -11,17 +11,16 @@ main_body:
br i1 %3, label %IF, label %ENDIF
IF: ; preds = %main_body
- %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+ %4 = call float @llvm.r600.dot4(<4 x float> %0, <4 x float> %0)
br label %ENDIF
ENDIF: ; preds = %IF, %main_body
%5 = phi float [%4, %IF], [0.000000e+00, %main_body]
%6 = insertelement <4 x float> undef, float %5, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %6, i32 0, i32 0)
ret void
}
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #1 = { readnone }
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll
index 0ce74d97ba8e..c1af815c7b1e 100644
--- a/test/CodeGen/AMDGPU/predicates.ll
+++ b/test/CodeGen/AMDGPU/predicates.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s
+; RUN: llc -spec-exec-max-speculation-cost=0 -march=r600 -r600-ir-structurize=0 -mcpu=redwood < %s | FileCheck %s
; These tests make sure the compiler is optimizing branches using predicates
; when it is legal to do so.
-; CHECK: {{^}}simple_if:
+; CHECK-LABEL: {{^}}simple_if:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF, label %ENDIF
+ %cmp0 = icmp sgt i32 %in, 0
+ br i1 %cmp0, label %IF, label %ENDIF
IF:
- %1 = shl i32 %in, 1
+ %tmp1 = shl i32 %in, 1
br label %ENDIF
ENDIF:
- %2 = phi i32 [ %in, %entry ], [ %1, %IF ]
- store i32 %2, i32 addrspace(1)* %out
+ %tmp2 = phi i32 [ %in, %entry ], [ %tmp1, %IF ]
+ store i32 %tmp2, i32 addrspace(1)* %out
ret void
}
-; CHECK: {{^}}simple_if_else:
+; CHECK-LABEL: {{^}}simple_if_else:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
@@ -44,7 +44,7 @@ ENDIF:
ret void
}
-; CHECK: {{^}}nested_if:
+; CHECK-LABEL: {{^}}nested_if:
; CHECK: ALU_PUSH_BEFORE
; CHECK: JUMP
; CHECK: POP
@@ -71,7 +71,7 @@ ENDIF:
ret void
}
-; CHECK: {{^}}nested_if_else:
+; CHECK-LABEL: {{^}}nested_if_else:
; CHECK: ALU_PUSH_BEFORE
; CHECK: JUMP
; CHECK: POP
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll
new file mode 100644
index 000000000000..cd8fb22e620a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -0,0 +1,252 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s
+
+
+; ALL-LABEL: {{^}}private_elt_size_v4i32:
+
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <4 x i32>], align 16
+ %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1
+ store <4 x i32> zeroinitializer, <4 x i32>* %gep0
+ store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %gep1
+ %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 %index
+ %load = load <4 x i32>, <4 x i32>* %gep2
+ store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v8i32:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
+
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
+
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <8 x i32>], align 16
+ %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1
+ store <8 x i32> zeroinitializer, <8 x i32>* %gep0
+ store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %gep1
+ %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 %index
+ %load = load <8 x i32>, <8 x i32>* %gep2
+ store <8 x i32> %load, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+
+; ALL-LABEL: {{^}}private_elt_size_i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x i64], align 16
+ %gep0 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 34359738602, i64* %gep1
+ %gep2 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_f64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x double], align 16
+ %gep0 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 1
+ store double 0.0, double* %gep0
+ store double 4.0, double* %gep1
+ %gep2 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 %index
+ %load = load double, double* %gep2
+ store double %load, double addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v2i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <2 x i64>], align 16
+ %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 1
+ store <2 x i64> zeroinitializer, <2 x i64>* %gep0
+ store <2 x i64> <i64 1, i64 2>, <2 x i64>* %gep1
+ %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 %index
+ %load = load <2 x i64>, <2 x i64>* %gep2
+ store <2 x i64> %load, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll
index a008ac98a43b..eea10c862238 100644
--- a/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -7,11 +7,11 @@
define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
store i32 0, i32* %tmp1
store i32 1, i32* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
%tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel
store i32 %tmp4, i32 addrspace(1)* %out
ret void
@@ -20,11 +20,11 @@ entry:
define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
store i32 0, i32* %tmp1
store i32 1, i32* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
%tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic
%val = extractvalue { i32, i1 } %tmp4, 0
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll
index 6b18a19f1956..8ba0b70dbdbb 100644
--- a/test/CodeGen/AMDGPU/private-memory-broken.ll
+++ b/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -march=amdgcn %s -o /dev/null 2>&1 | FileCheck %s
; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s
; Make sure promote alloca pass doesn't crash
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
new file mode 100644
index 000000000000..883bdc1ce265
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -0,0 +1,300 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}mova_same_clause:
+
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
+
+; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
+; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
+
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs; we should be able to
+; optimize this.
+
+; FUNC-LABEL: {{^}}multiple_structs:
+; R600-NOT: MOVA_INT
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32, i32* %a.indirect.ptr
+ %b.indirect = load i32, i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32, i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %b = load i32, i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32, i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32, i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}short_array:
+
+; R600: MOVA_INT
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16, i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}char_array:
+
+; R600: MOVA_INT
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8, i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-NOT: MOV * T0.X
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: {{^}}no_overlap:
+; R600_CHECK: MOV
+; R600_CHECK: [[CHAN:[XYZW]]]+
+; R600-NOT: [[CHAN]]+
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8, i8* %7
+ %10 = load i8, i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32, i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32]
+ %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ store i32 5, i32* %tmp0
+ %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %tmp2 = add i32 %tmp1, 5
+ %tmp3 = inttoptr i32 %tmp2 to i32*
+ %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
+ %tmp5 = load i32, i32* %tmp4
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll
deleted file mode 100644
index 79778eebd802..000000000000
--- a/test/CodeGen/AMDGPU/private-memory.ll
+++ /dev/null
@@ -1,325 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; FUNC-LABEL: {{^}}mova_same_clause:
-
-; R600: LDS_WRITE
-; R600: LDS_WRITE
-; R600: LDS_READ
-; R600: LDS_READ
-
-; HSA-PROMOTE: .amd_kernel_code_t
-; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
-; HSA-PROMOTE: .end_amd_kernel_code_t
-
-; SI-PROMOTE: ds_write_b32
-; SI-PROMOTE: ds_write_b32
-; SI-PROMOTE: ds_read_b32
-; SI-PROMOTE: ds_read_b32
-
-; HSA-ALLOCA: .amd_kernel_code_t
-; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
-; by 4 bytes.
-; HSA-ALLOCA: workitem_private_segment_byte_size = 24
-; HSA-ALLOCA: .end_amd_kernel_code_t
-
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
-entry:
- %stack = alloca [5 x i32], align 4
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
- store i32 4, i32* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
- store i32 5, i32* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
- %2 = load i32, i32* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
- %3 = load i32, i32* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
- ret void
-}
-
-; This test checks that the stack offset is calculated correctly for structs.
-; All register loads/stores should be optimized away, so there shouldn't be
-; any MOVA instructions.
-;
-; XXX: This generated code has unnecessary MOVs, we should be able to optimize
-; this.
-
-; FUNC-LABEL: {{^}}multiple_structs:
-; R600-NOT: MOVA_INT
-; SI-NOT: v_movrel
-; SI-NOT: v_movrel
-%struct.point = type { i32, i32 }
-
-define void @multiple_structs(i32 addrspace(1)* %out) {
-entry:
- %a = alloca %struct.point
- %b = alloca %struct.point
- %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
- %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
- %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
- %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
- store i32 0, i32* %a.x.ptr
- store i32 1, i32* %a.y.ptr
- store i32 2, i32* %b.x.ptr
- store i32 3, i32* %b.y.ptr
- %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
- %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
- %a.indirect = load i32, i32* %a.indirect.ptr
- %b.indirect = load i32, i32* %b.indirect.ptr
- %0 = add i32 %a.indirect, %b.indirect
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; Test direct access of a private array inside a loop. The private array
-; loads and stores should be lowered to copies, so there shouldn't be any
-; MOVA instructions.
-
-; FUNC-LABEL: {{^}}direct_loop:
-; R600-NOT: MOVA_INT
-; SI-NOT: v_movrel
-
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %prv_array_const = alloca [2 x i32]
- %prv_array = alloca [2 x i32]
- %a = load i32, i32 addrspace(1)* %in
- %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
- %b = load i32, i32 addrspace(1)* %b_src_ptr
- %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- store i32 %a, i32* %a_dst_ptr
- %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
- store i32 %b, i32* %b_dst_ptr
- br label %for.body
-
-for.body:
- %inc = phi i32 [0, %entry], [%count, %for.body]
- %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- %x = load i32, i32* %x_ptr
- %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %y = load i32, i32* %y_ptr
- %xy = add i32 %x, %y
- store i32 %xy, i32* %y_ptr
- %count = add i32 %inc, 1
- %done = icmp eq i32 %count, 4095
- br i1 %done, label %for.end, label %for.body
-
-for.end:
- %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %value = load i32, i32* %value_ptr
- store i32 %value, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}short_array:
-
-; R600: MOVA_INT
-
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
-; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = alloca [2 x i16]
- %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1
- store i16 0, i16* %1
- store i16 1, i16* %2
- %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index
- %4 = load i16, i16* %3
- %5 = sext i16 %4 to i32
- store i32 %5, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}char_array:
-
-; R600: MOVA_INT
-
-; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
-; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = alloca [2 x i8]
- %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1
- store i8 0, i8* %1
- store i8 1, i8* %2
- %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index
- %4 = load i8, i8* %3
- %5 = sext i8 %4 to i32
- store i32 %5, i32 addrspace(1)* %out
- ret void
-
-}
-
-; Make sure we don't overwrite workitem information with private memory
-
-; FUNC-LABEL: {{^}}work_item_info:
-; R600-NOT: MOV T0.X
-; Additional check in case the move ends up in the last slot
-; R600-NOT: MOV * TO.X
-
-; SI-NOT: v_mov_b32_e{{(32|64)}} v0
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = alloca [2 x i32]
- %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
- store i32 0, i32* %1
- store i32 1, i32* %2
- %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
- %4 = load i32, i32* %3
- %5 = call i32 @llvm.r600.read.tidig.x()
- %6 = add i32 %4, %5
- store i32 %6, i32 addrspace(1)* %out
- ret void
-}
-
-; Test that two stack objects are not stored in the same register
-; The second stack object should be in T3.X
-; FUNC-LABEL: {{^}}no_overlap:
-; R600_CHECK: MOV
-; R600_CHECK: [[CHAN:[XYZW]]]+
-; R600-NOT: [[CHAN]]+
-; SI: v_mov_b32_e32 v3
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = alloca [3 x i8], align 1
- %1 = alloca [2 x i8], align 1
- %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
- %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
- %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
- %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
- %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
- store i8 0, i8* %2
- store i8 1, i8* %3
- store i8 2, i8* %4
- store i8 1, i8* %5
- store i8 0, i8* %6
- %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
- %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
- %9 = load i8, i8* %7
- %10 = load i8, i8* %8
- %11 = add i8 %9, %10
- %12 = sext i8 %11 to i32
- store i32 %12, i32 addrspace(1)* %out
- ret void
-}
-
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i8]]
- %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
- store i8 0, i8* %gep0
- store i8 1, i8* %gep1
- %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i8, i8* %gep2
- %sext = sext i8 %load to i32
- store i32 %sext, i32 addrspace(1)* %out
- ret void
-}
-
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i32]]
- %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i64]]
- %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
- store i64 0, i64* %gep0
- store i64 1, i64* %gep1
- %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i64, i64* %gep2
- store i64 %load, i64 addrspace(1)* %out
- ret void
-}
-
-%struct.pair32 = type { i32, i32 }
-
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x %struct.pair32]]
- %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
- %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x %struct.pair32]
- %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
- %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
-entry:
- %tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
- store i32 0, i32* %tmp1
- store i32 1, i32* %tmp2
- %cmp = icmp eq i32 %in, 0
- %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
- %load = load i32, i32* %sel
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
-; finds one, it should stop trying to promote.
-
-; FUNC-LABEL: ptrtoint:
-; SI-NOT: ds_write
-; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
-; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
- %alloca = alloca [16 x i32]
- %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
- store i32 5, i32* %tmp0
- %tmp1 = ptrtoint [16 x i32]* %alloca to i32
- %tmp2 = add i32 %tmp1, 5
- %tmp3 = inttoptr i32 %tmp2 to i32*
- %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
- %tmp5 = load i32, i32* %tmp4
- store i32 %tmp5, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
new file mode 100644
index 000000000000..3bd0aecf7aa9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Make sure this allocates the correct size if the alloca has a non-0
+; number of elements.
+
+; CHECK-LABEL: @array_alloca(
+; CHECK: %stack = alloca i32, i32 5, align 4
+define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca i32, i32 5, align 4
+ %ld0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
+ %ld2 = load i32, i32* %arrayidx10, align 4
+ store i32 %ld2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
+ %ld3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK-LABEL: @array_alloca_dynamic(
+; CHECK: %stack = alloca i32, i32 %size, align 4
+define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+entry:
+ %stack = alloca i32, i32 %size, align 4
+ %ld0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
+ %ld2 = load i32, i32* %arrayidx10, align 4
+ store i32 %ld2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
+ %ld3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 10739df08379..82030f377d9f 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -6,13 +6,14 @@
declare void @foo(float*) #0
declare void @foo.varargs(...) #0
-; CHECK: error: unsupported call to function foo in crash_call_constexpr_cast
+; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
define void @crash_call_constexpr_cast() #0 {
%alloca = alloca i32
call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0
ret void
}
+; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
define void @crash_call_constexpr_cast_varargs() #0 {
%alloca = alloca i32
call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
new file mode 100644
index 000000000000..eb0d0cc62697
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+
+@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+
+; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR: alloca [10 x i32]
+; ASM-LABEL: {{^}}promote_alloca_size_256:
+; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [10 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+ %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0
+ store i32 %tmp3, i32 addrspace(3)* %v0
+ %v1 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array1, i32 0, i32 0, i32 0
+ store i32 %tmp3, i32 addrspace(3)* %v1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
new file mode 100644
index 000000000000..6a9ec31696d2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare {}* @llvm.invariant.start(i64, i8* nocapture) #0
+declare void @llvm.invariant.end({}*, i64, i8* nocapture) #0
+declare i8* @llvm.invariant.group.barrier(i8*) #1
+
+; GCN-LABEL: {{^}}use_invariant_promotable_lds:
+; GCN: buffer_load_dword
+; GCN: ds_write_b32
+define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ %tmp4 = call {}* @llvm.invariant.start(i64 4, i8* %tmp1) #0
+ call void @llvm.invariant.end({}* %tmp4, i64 4, i8* %tmp1) #0
+ %tmp5 = call i8* @llvm.invariant.group.barrier(i8* %tmp1) #1
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
new file mode 100644
index 000000000000..eeda19fa27ac
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca %s | FileCheck -check-prefix=OPT %s
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; OPT-LABEL: @use_lifetime_promotable_lds(
+; OPT-NOT: alloca i32
+; OPT-NOT: llvm.lifetime
+; OPT: store i32 %tmp3, i32 addrspace(3)*
+define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %tmp1)
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ call void @llvm.lifetime.end(i64 4, i8* %tmp1)
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
new file mode 100644
index 000000000000..01ecb638b033
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) #0
+declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) #0
+
+declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) #0
+declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) #0
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
+
+; CHECK-LABEL: @promote_with_memcpy(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
+define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p0i8.p1i8.i32(i8* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %out.bc, i8* %alloca.bc, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_memmove(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
+define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memmove.p0i8.p1i8.i32(i8* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+ call void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* %out.bc, i8* %alloca.bc, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_memset(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
+define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memset.p0i8.i32(i8* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_objectsize(
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
+define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
+ store i32 %size, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-max-waves-per-eu"="3" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
new file mode 100644
index 000000000000..7c5a5182bc8e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -0,0 +1,38 @@
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
+; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
+; NOOPTS-NOT: ds_write
+; OPTS: ds_write
+define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
+; ALL: workgroup_group_segment_byte_size = 0{{$}}
+; ALL-NOT: ds_write
+define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
new file mode 100644
index 000000000000..46fe307a17fe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; This shows that the LDS usage estimate is sensitive to the order
+; of the LDS globals.
+
+; Both of these functions use the same amount of LDS, but the total
+; changes depending on the visit order of first use.
+
+; The one with the suboptimal order, resulting in extra padding, exceeds
+; the desired limit.
+
+; The padding estimate heuristic used by the promote alloca pass
+; is mostly determined by the order of the globals.
+
+; Raw usage = 1060 bytes
+; Rounded usage:
+; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
+; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+
+; At default occupancy guess of 7, 2340 bytes available total.
+
+; 1280 bytes need to be left free to promote the alloca; optimally packed,
+; this requires the LDS globals to fit in the remaining 1060 bytes.
+
+
+@lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
+@lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+@lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4
+
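+; A rough sketch of where these numbers come from, assuming the 64-item
+; workgroup from the function attributes and the globals above:
+;   @lds1: 73 x i32       = 292 bytes, align 4
+;   @lds2: 32 x i64       = 256 bytes, align 8
+;   @lds0: 32 x <4 x i32> = 512 bytes, align 16
+; One visit order needs 4 + 8 bytes of alignment padding (1072 bytes), the
+; other packs perfectly (1060 bytes). The promoted [5 x i32] alloca in the
+; functions below adds 5 * 4 * 64 = 1280 bytes, so the two kernels end up
+; at 1060 + 1280 = 2340 and 1072 + 1280 = 2352 bytes, matching the checks.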
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_0:
+; GCN: workgroup_group_segment_byte_size = 2340
+define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+ store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_1:
+; GCN: workgroup_group_segment_byte_size = 2352
+define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+ store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+ ret void
+}
+
+@lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
+@lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16
+
+; The guess from the alignment padding pushes this over the determined
+; size limit, so it isn't promoted.
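+; (Rough arithmetic, under the same assumptions as above: @lds3 is
+; 13 x i32 = 52 bytes and @lds4 is 63 x <4 x i32> = 1008 bytes. Padding
+; 52 up to @lds4's 16-byte alignment gives 64 + 1008 = 1072 bytes, and
+; adding the 1280-byte alloca would exceed the 2340-byte budget, so the
+; alloca isn't promoted and only the 52 + 1008 = 1060 bytes of real LDS
+; remain.)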
+
+; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
+; GCN: workgroup_group_segment_byte_size = 1060
+define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
+
+ %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-shaders.ll b/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
new file mode 100644
index 000000000000..d40fca9f4fd5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR: alloca [5 x i32]
+; ASM-LABEL: {{^}}promote_alloca_shaders:
+; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
+
+define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx4, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx5
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 2ee98cc3d2d2..307eca712cc8 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Pointer value is stored in a candidate for LDS usage.
@@ -11,6 +12,18 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
ret void
}
+; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
+; GCN: buffer_store_dword v
+define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 0.0, float *%tmp0
+ store float 0.0, float *%tmp1
+ store volatile float* %tmp0, float* addrspace(1)* %ptr
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ ret void
+}
+
; GCN-LABEL: {{^}}stored_lds_pointer_value_gep:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
@@ -36,17 +49,27 @@ bb:
define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
+ %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
+ %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
+ %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
store i32 0, i32* %x
store i32 1, i32* %y
store i32 2, i32* %z
store i32 3, i32* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
+ %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
store i32* %tmp1, i32* addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-NOT: ds_
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
new file mode 100644
index 000000000000..857e547aa03b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; This would normally be fixed by instcombine, turning it into a compare of
+; the GEP indices
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %cmp = icmp eq i32* %ptr0, %ptr1
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
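+
+; A hypothetical sketch (not part of the checked output) of the instcombine
+; fold referred to above: a compare of two GEPs off the same base would become
+; a compare of the indices, e.g.
+;   %cmp = icmp eq i32 %a, %b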
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null
+define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %cmp = icmp eq i32* %ptr0, null
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0
+define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %cmp = icmp eq i32* null, %ptr0
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
+; CHECK: %alloca = alloca [16 x i32], align 4
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+; CHECK: %ptr1 = call i32* @get_unknown_pointer()
+; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = call i32* @get_unknown_pointer()
+ %cmp = icmp eq i32* %ptr0, %ptr1
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
new file mode 100644
index 000000000000..a0ad564a6c8f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -0,0 +1,204 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+
+; CHECK-LABEL: @branch_ptr_var_same_alloca(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %else
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+else:
+ %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
+define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %endif
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ null, %entry ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
+define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %endif
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ null, %entry ], [ %arrayidx0, %if ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @one_phi_value(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %14
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: br label %exit
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @one_phi_value(i32 %a) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %exit
+
+exit:
+ %phi.ptr = phi i32* [ %arrayidx0, %entry ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_alloca_unknown_obj(
+; CHECK: %alloca = alloca [64 x i32], align 4
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = call i32* @get_unknown_pointer()
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32* %phi.ptr, align 4
+define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %else
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+else:
+ %arrayidx1 = call i32* @get_unknown_pointer()
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; kernel void ptr_induction_var_same_alloca(void)
+; {
+; int alloca[64];
+; int i = 0;
+
+; #pragma nounroll
+; for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i)
+; {
+; *p = i;
+; }
+; }
+
+; FIXME: This should be promotable. We need to use
+; GetUnderlyingObjects when looking at the icmp user.
+
+; CHECK-LABEL: @ptr_induction_var_same_alloca(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+define void @ptr_induction_var_same_alloca() #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+ %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+ store i32 %i.09, i32* %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+ %inc = add nuw nsw i32 %i.09, 1
+ %cmp = icmp eq i32* %incdec.ptr, %arrayidx1
+ br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
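+
+; If the FIXME above were addressed and this alloca were promoted, the loop
+; exit compare would be expected to move into the LDS address space, roughly
+; (hypothetical, not checked by this test):
+;   %cmp = icmp eq i32 addrspace(3)* %incdec.ptr, %arrayidx1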
+
+
+; extern int* get_unknown_pointer(void);
+
+; kernel void ptr_induction_var_alloca_unknown(void)
+; {
+; int alloca[64];
+; int i = 0;
+;
+; for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i)
+; {
+; *p = i;
+; }
+; }
+
+; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
+define void @ptr_induction_var_alloca_unknown() #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+ %call = tail call i32* @get_unknown_pointer() #2
+ %cmp.7 = icmp eq i32* %arrayidx, %call
+ br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+ store i32 %i.09, i32* %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+ %inc = add nuw nsw i32 %i.09, 1
+ %cmp = icmp eq i32* %incdec.ptr, %call
+ br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
new file mode 100644
index 000000000000..bb13adb19852
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -0,0 +1,133 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
+; CHECK: %alloca = alloca i32
+; CHECK: select i1 undef, i32* undef, i32* %alloca
+define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+ %alloca = alloca i32, align 4
+ %select = select i1 undef, i32* undef, i32* %alloca
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; FIXME: This should be promotable but requires knowing that both will be promoted first.
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
+; CHECK: %alloca0 = alloca i32, i32 16, align 4
+; CHECK: %alloca1 = alloca i32, i32 16, align 4
+; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
+define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+ %alloca0 = alloca i32, i32 16, align 4
+ %alloca1 = alloca i32, i32 16, align 4
+ %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+ %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
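+
+; Hypothetical result if both allocas were promoted together (see the FIXME
+; above): both derived pointers would be in LDS and the select could remain as
+;   %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1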
+
+; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
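+
+; A sketch of the canonical form the TODO above has in mind (select the
+; constant index first, then GEP); hypothetical IR, not checked by this test:
+;   %idx = select i1 undef, i32 1, i32 3
+;   %ptr = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %0, i32 0, i32 %idx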
+
+; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
+; CHECK: getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_select_input_select.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
+; CHECK: %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %c
+; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
+; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
+define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+ %select0 = select i1 undef, i32* %ptr0, i32* %ptr1
+ %select1 = select i1 undef, i32* %select0, i32* %ptr2
+ store i32 0, i32* %select1, align 4
+ ret void
+}
+
+define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ store i32 0, i32* %ptr0
+ br i1 undef, label %bb1, label %bb2
+
+bb1:
+ %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+ %select0 = select i1 undef, i32* undef, i32* %ptr2
+ store i32 0, i32* %ptr1
+ br label %bb2
+
+bb2:
+ %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
+ %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
+ store i32 0, i32* %select1, align 4
+ ret void
+}
+
+; CHECK-LABEL: @select_null_rhs(
+; CHECK-NOT: alloca
+; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
+define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+ %tmp2 = icmp eq i32 %arg1, 0
+ %tmp3 = select i1 %tmp2, double* %tmp, double* null
+ store double 1.000000e+00, double* %tmp3, align 8
+ %tmp4 = load double, double* %tmp, align 8
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+; CHECK-LABEL: @select_null_lhs(
+; CHECK-NOT: alloca
+; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
+define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+ %tmp2 = icmp eq i32 %arg1, 0
+ %tmp3 = select i1 %tmp2, double* null, double* %tmp
+ store double 1.000000e+00, double* %tmp3, align 8
+ %tmp4 = load double, double* %tmp, align 8
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "amdgpu-max-waves-per-eu"="1" }
+attributes #1 = { norecurse nounwind } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
new file mode 100644
index 000000000000..e331731f90f6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; This is just an arbitrary intrinsic that shouldn't ever need to be
+; handled to ensure it doesn't crash.
+
+declare void @llvm.stackrestore(i8*) #2
+
+; CHECK-LABEL: @try_promote_unhandled_intrinsic(
+; CHECK: alloca
+; CHECK: call void @llvm.stackrestore(i8* %tmp1)
+define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ call void @llvm.stackrestore(i8* %tmp1)
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
new file mode 100644
index 000000000000..f9de38839bc5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @volatile_load(
+; CHECK: alloca [5 x i32]
+; CHECK: load volatile i32, i32*
+define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load volatile i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @volatile_store(
+; CHECK: alloca [5 x i32]
+; CHECK: store volatile i32 %tmp, i32*
+define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ store volatile i32 %tmp, i32* %arrayidx1
+ ret void
+}
+
+; Has an OK non-volatile user but also a volatile user
+; CHECK-LABEL: @volatile_and_non_volatile_load(
+; CHECK: alloca double
+; CHECK: load double
+; CHECK: load volatile double
+define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+
+ %tmp4 = load double, double* %tmp, align 8
+ %tmp5 = load volatile double, double* %tmp, align 8
+
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/pv-packing.ll b/test/CodeGen/AMDGPU/pv-packing.ll
index abeae563ff3f..b01c00daede3 100644
--- a/test/CodeGen/AMDGPU/pv-packing.ll
+++ b/test/CodeGen/AMDGPU/pv-packing.ll
@@ -3,7 +3,7 @@
;CHECK: DOT4 T{{[0-9]\.X}}
;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -16,7 +16,7 @@ main_body:
%8 = extractelement <4 x float> %reg3, i32 2
%9 = load <4 x float>, <4 x float> addrspace(8)* null
%10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
+ %11 = call float @llvm.r600.dot4(<4 x float> %9, <4 x float> %9)
%12 = fmul float %0, %3
%13 = fadd float %12, %6
%14 = fmul float %1, %4
@@ -29,17 +29,16 @@ main_body:
%21 = insertelement <4 x float> %20, float %15, i32 1
%22 = insertelement <4 x float> %21, float %17, i32 2
%23 = insertelement <4 x float> %22, float %19, i32 3
- %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
+ %24 = call float @llvm.r600.dot4(<4 x float> %23, <4 x float> %10)
%25 = insertelement <4 x float> undef, float %24, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %25, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll
index 9a57dd19765a..d5f9833d6ad0 100644
--- a/test/CodeGen/AMDGPU/pv.ll
+++ b/test/CodeGen/AMDGPU/pv.ll
@@ -3,7 +3,7 @@
; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -101,9 +101,9 @@ main_body:
%93 = insertelement <4 x float> %92, float %5, i32 1
%94 = insertelement <4 x float> %93, float %6, i32 2
%95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
- %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
- %97 = call float @fabs(float %96)
- %98 = call float @llvm.AMDGPU.rsq.f32(float %97)
+ %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
+ %97 = call float @llvm.fabs.f32(float %96)
+ %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97)
%99 = fmul float %4, %98
%100 = fmul float %5, %98
%101 = fmul float %6, %98
@@ -119,10 +119,10 @@ main_body:
%111 = extractelement <4 x float> %110, i32 2
%112 = fmul float %111, %10
%113 = fadd float %112, %22
- %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00)
- %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
- %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
- %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+ %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00)
+ %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00)
+ %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00)
+ %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
%118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
%119 = extractelement <4 x float> %118, i32 0
%120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
@@ -137,7 +137,7 @@ main_body:
%129 = insertelement <4 x float> %128, float %121, i32 1
%130 = insertelement <4 x float> %129, float %123, i32 2
%131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
- %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
+ %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
%133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
%134 = extractelement <4 x float> %133, i32 0
%135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
@@ -152,7 +152,7 @@ main_body:
%144 = insertelement <4 x float> %143, float %136, i32 1
%145 = insertelement <4 x float> %144, float %138, i32 2
%146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
- %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
+ %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
%148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
%149 = extractelement <4 x float> %148, i32 0
%150 = fmul float %149, %8
@@ -202,40 +202,39 @@ main_body:
%194 = fadd float %193, %188
%195 = fmul float %181, %174
%196 = fadd float %195, %190
- %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00)
- %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00)
- %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00)
+ %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00)
+ %198 = call float @llvm.AMDGPU.clamp.f32(float %194, float 0.000000e+00, float 1.000000e+00)
+ %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00)
%200 = insertelement <4 x float> undef, float %75, i32 0
%201 = insertelement <4 x float> %200, float %79, i32 1
%202 = insertelement <4 x float> %201, float %83, i32 2
%203 = insertelement <4 x float> %202, float %87, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %203, i32 60, i32 1)
%204 = insertelement <4 x float> undef, float %197, i32 0
%205 = insertelement <4 x float> %204, float %198, i32 1
%206 = insertelement <4 x float> %205, float %199, i32 2
%207 = insertelement <4 x float> %206, float %117, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %207, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
-declare float @fabs(float) #2
+declare float @llvm.fabs.f32(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #1
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #3
+declare float @llvm.pow.f32(float, float) #2
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #3
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
-attributes #2 = { readonly }
-attributes #3 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/r600-encoding.ll b/test/CodeGen/AMDGPU/r600-encoding.ll
index 3a82ee30a328..e14b30680ba1 100644
--- a/test/CodeGen/AMDGPU/r600-encoding.ll
+++ b/test/CodeGen/AMDGPU/r600-encoding.ll
@@ -10,16 +10,14 @@
; R600: {{^}}test:
; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
entry:
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fmul float %r0, %r1
%vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/r600-export-fix.ll b/test/CodeGen/AMDGPU/r600-export-fix.ll
index 7cb80195b368..7d86f9e3b3f1 100644
--- a/test/CodeGen/AMDGPU/r600-export-fix.ll
+++ b/test/CodeGen/AMDGPU/r600-export-fix.ll
@@ -10,7 +10,7 @@
;CHECK: EXPORT T{{[0-9]}}.0000
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -98,45 +98,43 @@ main_body:
%83 = insertelement <4 x float> %82, float %55, i32 1
%84 = insertelement <4 x float> %83, float %59, i32 2
%85 = insertelement <4 x float> %84, float %63, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %85, i32 60, i32 1)
%86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1
%88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2
%89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %89, i32 0, i32 2)
%90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1
%92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2
%93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %93, i32 1, i32 2)
%94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%95 = insertelement <4 x float> %94, float %65, i32 1
%96 = insertelement <4 x float> %95, float %67, i32 2
%97 = insertelement <4 x float> %96, float %69, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %97, i32 2, i32 2)
%98 = insertelement <4 x float> undef, float %77, i32 0
%99 = insertelement <4 x float> %98, float %79, i32 1
%100 = insertelement <4 x float> %99, float %81, i32 2
%101 = insertelement <4 x float> %100, float %71, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %101, i32 3, i32 2)
%102 = insertelement <4 x float> undef, float %73, i32 0
%103 = insertelement <4 x float> %102, float %75, i32 1
%104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2
%105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %105, i32 4, i32 2)
%106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1
%108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2
%109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %109, i32 5, i32 2)
%110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1
%112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2
%113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %113, i32 6, i32 2)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
index f388f8ffe293..461caf5b5d20 100644
--- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -1,58 +1,58 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman
+; RUN: llc -march=r600 -mcpu=cayman < %s
-define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %arg, <4 x float> inreg %arg1) {
main_body:
- %2 = extractelement <4 x float> %0, i32 0
- %3 = extractelement <4 x float> %0, i32 1
- %4 = extractelement <4 x float> %0, i32 2
- %5 = extractelement <4 x float> %0, i32 3
- %6 = insertelement <4 x float> undef, float %2, i32 0
- %7 = insertelement <4 x float> %6, float %3, i32 1
- %8 = insertelement <4 x float> %7, float %4, i32 2
- %9 = insertelement <4 x float> %8, float %5, i32 3
- %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
- %11 = extractelement <4 x float> %10, i32 0
- %12 = extractelement <4 x float> %10, i32 1
- %13 = extractelement <4 x float> %10, i32 2
- %14 = extractelement <4 x float> %10, i32 3
- %15 = call float @fabs(float %13)
- %16 = fdiv float 1.000000e+00, %15
- %17 = fmul float %11, %16
- %18 = fadd float %17, 1.500000e+00
- %19 = fmul float %12, %16
- %20 = fadd float %19, 1.500000e+00
- %21 = insertelement <4 x float> undef, float %20, i32 0
- %22 = insertelement <4 x float> %21, float %18, i32 1
- %23 = insertelement <4 x float> %22, float %14, i32 2
- %24 = insertelement <4 x float> %23, float %5, i32 3
- %25 = extractelement <4 x float> %24, i32 0
- %26 = extractelement <4 x float> %24, i32 1
- %27 = extractelement <4 x float> %24, i32 2
- %28 = extractelement <4 x float> %24, i32 3
- %29 = insertelement <4 x float> undef, float %25, i32 0
- %30 = insertelement <4 x float> %29, float %26, i32 1
- %31 = insertelement <4 x float> %30, float %27, i32 2
- %32 = insertelement <4 x float> %31, float %28, i32 3
- %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
- %34 = extractelement <4 x float> %33, i32 0
- %35 = insertelement <4 x float> undef, float %34, i32 0
- %36 = insertelement <4 x float> %35, float %34, i32 1
- %37 = insertelement <4 x float> %36, float %34, i32 2
- %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
+ %tmp = extractelement <4 x float> %arg, i32 0
+ %tmp2 = extractelement <4 x float> %arg, i32 1
+ %tmp3 = extractelement <4 x float> %arg, i32 2
+ %tmp4 = extractelement <4 x float> %arg, i32 3
+ %tmp5 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
+ %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp11 = extractelement <4 x float> %tmp9, i32 1
+ %tmp12 = extractelement <4 x float> %tmp9, i32 2
+ %tmp13 = extractelement <4 x float> %tmp9, i32 3
+ %tmp14 = call float @fabs(float %tmp12)
+ %tmp15 = fdiv float 1.000000e+00, %tmp14
+ %tmp16 = fmul float %tmp10, %tmp15
+ %tmp17 = fadd float %tmp16, 1.500000e+00
+ %tmp18 = fmul float %tmp11, %tmp15
+ %tmp19 = fadd float %tmp18, 1.500000e+00
+ %tmp20 = insertelement <4 x float> undef, float %tmp19, i32 0
+ %tmp21 = insertelement <4 x float> %tmp20, float %tmp17, i32 1
+ %tmp22 = insertelement <4 x float> %tmp21, float %tmp13, i32 2
+ %tmp23 = insertelement <4 x float> %tmp22, float %tmp4, i32 3
+ %tmp24 = extractelement <4 x float> %tmp23, i32 0
+ %tmp25 = extractelement <4 x float> %tmp23, i32 1
+ %tmp26 = extractelement <4 x float> %tmp23, i32 2
+ %tmp27 = extractelement <4 x float> %tmp23, i32 3
+ %tmp28 = insertelement <4 x float> undef, float %tmp24, i32 0
+ %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 1
+ %tmp30 = insertelement <4 x float> %tmp29, float %tmp26, i32 2
+ %tmp31 = insertelement <4 x float> %tmp30, float %tmp27, i32 3
+ %tmp32 = shufflevector <4 x float> %tmp31, <4 x float> %tmp31, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp33 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp32, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp34 = extractelement <4 x float> %tmp33, i32 0
+ %tmp35 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp36 = insertelement <4 x float> %tmp35, float %tmp34, i32 1
+ %tmp37 = insertelement <4 x float> %tmp36, float %tmp34, i32 2
+ %tmp38 = insertelement <4 x float> %tmp37, float 1.000000e+00, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp38, i32 0, i32 0)
ret void
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
; Function Attrs: readnone
-declare float @fabs(float) #1
+declare float @fabs(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/r600.private-memory.ll b/test/CodeGen/AMDGPU/r600.private-memory.ll
new file mode 100644
index 000000000000..f406c160cbbe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.private-memory.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-NOT: MOV * T0.X
+
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
new file mode 100644
index 000000000000..ff248a89cedc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}tgid_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.X
+define void @tgid_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y
+define void @tgid_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z
+define void @tgid_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X
+define void @tidig_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y
+define void @tidig_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z
+define void @tidig_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_implicit:
+; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
+define void @test_implicit(i32 addrspace(1)* %out) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
+ %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4
+ %value = load i32, i32 addrspace(7)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_implicit_dyn:
+; 36 prepended implicit bytes + 8(out pointer + in) = 44
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
+define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
+ %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in
+ %value = load i32, i32 addrspace(7)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+
+
+; DEPRECATED but R600 only
+
+; FUNC-LABEL: {{^}}workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].Z
+define void @workdim (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.workdim() #0
+
+declare i8 addrspace(7)* @llvm.r600.implicitarg.ptr() #0
+
+declare i32 @llvm.r600.read.tgid.x() #0
+declare i32 @llvm.r600.read.tgid.y() #0
+declare i32 @llvm.r600.read.tgid.z() #0
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.r600.read.tidig.z() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/r600cfg.ll b/test/CodeGen/AMDGPU/r600cfg.ll
index c7b9d65220f3..2996a1053da5 100644
--- a/test/CodeGen/AMDGPU/r600cfg.ll
+++ b/test/CodeGen/AMDGPU/r600cfg.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -32,27 +32,27 @@ IF41: ; preds = %LOOP
%17 = insertelement <4 x float> %16, float %temp8.0, i32 1
%18 = insertelement <4 x float> %17, float %temp12.0, i32 2
%19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
+ call void @llvm.r600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
%20 = insertelement <4 x float> undef, float %0, i32 0
%21 = insertelement <4 x float> %20, float %temp8.0, i32 1
%22 = insertelement <4 x float> %21, float %temp12.0, i32 2
%23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
+ call void @llvm.r600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
%24 = insertelement <4 x float> undef, float %0, i32 0
%25 = insertelement <4 x float> %24, float %temp8.0, i32 1
%26 = insertelement <4 x float> %25, float %temp12.0, i32 2
%27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
+ call void @llvm.r600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
%28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1
%30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2
%31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %31, i32 60, i32 1)
%32 = insertelement <4 x float> undef, float %0, i32 0
%33 = insertelement <4 x float> %32, float %temp8.0, i32 1
%34 = insertelement <4 x float> %33, float %temp12.0, i32 2
%35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %35, i32 0, i32 2)
ret void
ENDIF40: ; preds = %LOOP
@@ -112,8 +112,6 @@ ENDIF48: ; preds = %LOOP47
br label %LOOP47
}
-declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
+declare void @llvm.r600.store.stream.output(<4 x float>, i32, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
new file mode 100644
index 000000000000..b1d422062543
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: Evergreen only ever does unsafe fp math.
+; FUNC-LABEL: {{^}}rcp_pat_f32:
+; EG: RECIP_IEEE
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
new file mode 100644
index 000000000000..a5581d73cb25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid register "flat_scratch_lo" for subtarget.
+
+declare i32 @llvm.read_register.i32(metadata) #0
+
+define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"flat_scratch_lo"}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
new file mode 100644
index 000000000000..2617ad7402ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for register "exec".
+
+declare i32 @llvm.read_register.i32(metadata) #0
+
+define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"exec"}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
new file mode 100644
index 000000000000..dcde8a1894fc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for register "m0".
+
+declare i64 @llvm.read_register.i64(metadata) #0
+
+define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+ %exec = call i64 @llvm.read_register.i64(metadata !0)
+ store i64 %exec, i64 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"m0"}
diff --git a/test/CodeGen/AMDGPU/read_register.ll b/test/CodeGen/AMDGPU/read_register.ll
new file mode 100644
index 000000000000..58a9e34b77f2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read_register.ll
@@ -0,0 +1,81 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.read_register.i32(metadata) #0
+declare i64 @llvm.read_register.i64(metadata) #0
+
+; CHECK-LABEL: {{^}}test_read_m0:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_m0(i32 addrspace(1)* %out) #0 {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec:
+; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
+; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_read_exec(i64 addrspace(1)* %out) #0 {
+ %exec = call i64 @llvm.read_register.i64(metadata !1)
+ store i64 %exec, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch:
+; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
+; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+ %flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
+ store i64 %flat_scratch, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+ %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
+ store i32 %flat_scratch_lo, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+ %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
+ store i32 %flat_scratch_hi, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec_lo:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+ %exec_lo = call i32 @llvm.read_register.i32(metadata !5)
+ store i32 %exec_lo, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec_hi:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+ %exec_hi = call i32 @llvm.read_register.i32(metadata !6)
+ store i32 %exec_hi, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"m0"}
+!1 = !{!"exec"}
+!2 = !{!"flat_scratch"}
+!3 = !{!"flat_scratch_lo"}
+!4 = !{!"flat_scratch_hi"}
+!5 = !{!"exec_lo"}
+!6 = !{!"exec_hi"}
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
new file mode 100644
index 000000000000..e6d0efd0ff94
--- /dev/null
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.readcyclecounter() #0
+
+; GCN-LABEL: {{^}}test_readcyclecounter:
+; SI-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; VI-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.readcyclecounter()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.readcyclecounter()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll
index b4ac47afced7..f9292a788521 100644
--- a/test/CodeGen/AMDGPU/reciprocal.ll
+++ b/test/CodeGen/AMDGPU/reciprocal.ll
@@ -2,14 +2,12 @@
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = fdiv float 1.0, %r0
%vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
new file mode 100644
index 000000000000..c902cb9e1dfb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
+; GCN: buffer_store_dwordx2
+define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %and = and i64 %a, 1234567
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: buffer_store_dword [[VAL]]
+define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %vec = bitcast i64 %a to <2 x i32>
+ %elt0 = extractelement <2 x i32> %vec, i32 0
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; GCN: buffer_store_dword [[VAL]]
+define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %vec = bitcast i64 %a to <2 x i32>
+ %elt0 = extractelement <2 x i32> %vec, i32 1
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
new file mode 100644
index 000000000000..281e49f804c6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+ %x.bc = bitcast <2 x i32> %x to <4 x i16>
+ store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v8i16_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+ %x.bc = bitcast <4 x i32> %x to <8 x i16>
+ store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_i64_align_4(i64 addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+  %x.bc = bitcast <2 x i32> %x to i64
+  store i64 %x.bc, i64 addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v2i64_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+ %x.bc = bitcast <4 x i32> %x to <2 x i64>
+ store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+ %x.bc = bitcast <4 x i16> %x to <2 x i32>
+ store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
new file mode 100644
index 000000000000..6e95f4c7521f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -o /dev/null < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o /dev/null < %s
+
+; The register coalescer introduces a verifier error which later
+; results in a crash during scheduling.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+bb:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp0 = icmp eq i32 %id.x, 0
+ br i1 %cmp0, label %bb3, label %bb4
+
+bb3: ; preds = %bb
+ %tmp = ashr exact i32 undef, 8
+ br label %bb6
+
+bb4: ; preds = %bb6, %bb
+ %tmp5 = phi <2 x i32> [ zeroinitializer, %bb ], [ %tmp13, %bb6 ]
+ br i1 undef, label %bb15, label %bb16
+
+bb6: ; preds = %bb6, %bb3
+ %tmp7 = phi <2 x i32> [ zeroinitializer, %bb3 ], [ %tmp13, %bb6 ]
+ %tmp8 = add nsw i32 0, %arg1
+ %tmp9 = add nsw i32 %tmp8, 0
+ %tmp10 = sext i32 %tmp9 to i64
+ %tmp11 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg, i64 %tmp10
+ %tmp12 = load <2 x i32>, <2 x i32> addrspace(1)* %tmp11, align 8
+ %tmp13 = add <2 x i32> %tmp12, %tmp7
+ %tmp14 = icmp slt i32 undef, %arg2
+ br i1 %tmp14, label %bb6, label %bb4
+
+bb15: ; preds = %bb4
+ store <2 x i32> %tmp5, <2 x i32> addrspace(3)* undef, align 8
+ br label %bb16
+
+bb16: ; preds = %bb15, %bb4
+ unreachable
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll
index 4bb315049be4..bff3a9f5d2b0 100644
--- a/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -1,7 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}foo:
; SI: .section .AMDGPU.csdata
@@ -9,7 +10,8 @@ declare i32 @llvm.SI.tid() nounwind readnone
; SI: ; NumSgprs: {{[0-9]+}}
; SI: ; NumVgprs: {{[0-9]+}}
define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
%bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid
%outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
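The hunk above replaces the removed llvm.SI.tid intrinsic with the mbcnt pair. As a minimal, self-contained sketch of that idiom (the function name and the store are illustrative only; the intrinsic declarations are the same ones added above): with an all-ones mask, mbcnt.lo counts the set bits for lanes below the current lane within the low 32 lanes, and mbcnt.hi adds the count for the high 32 lanes, so the chained result is the lane index within the 64-wide wavefront.

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0

; Count the lower-lane mask bits in the low half, then add the high half.
define void @lane_index_sketch(i32 addrspace(1)* %out) #1 {
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)     ; lanes 0-31 below this lane
  %lane = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) ; plus lanes 32-63 below this lane
  store i32 %lane, i32 addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }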
diff --git a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
new file mode 100644
index 000000000000..47bdfba96530
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -verify-machineinstrs -o /dev/null %s
+; Check that renameDisconnectedComponents() does not create vregs that lack a
+; definition on some path (at minimum there should be IMPLICIT_DEF instructions).
+target triple = "amdgcn--"
+
+define void @func() {
+B0:
+ br i1 undef, label %B1, label %B2
+
+B1:
+ br label %B2
+
+B2:
+ %v0 = phi <4 x float> [ zeroinitializer, %B1 ], [ <float 0.0, float 0.0, float 0.0, float undef>, %B0 ]
+ br i1 undef, label %B20.1, label %B20.2
+
+B20.1:
+ br label %B20.2
+
+B20.2:
+ %v2 = phi <4 x float> [ zeroinitializer, %B20.1 ], [ %v0, %B2 ]
+ br i1 undef, label %B30.1, label %B30.2
+
+B30.1:
+ %sub = fsub <4 x float> %v2, undef
+ br label %B30.2
+
+B30.2:
+ %v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v2, %B20.2 ]
+ %ve0 = extractelement <4 x float> %v3, i32 0
+ store float %ve0, float addrspace(3)* undef, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
new file mode 100644
index 000000000000..2dd21ca51e46
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
@@ -0,0 +1,30 @@
+# RUN: llc -march=amdgcn -run-pass rename-independent-subregs -o - %s | FileCheck %s
+--- |
+ define void @test0() { ret void }
+...
+---
+# In the test below there are two independent def+use pairs of subregister sub1,
+# each of which can be moved to a new virtual register. The third def of sub1,
+# however, is used in combination with sub0 and needs to stay with the original vreg.
+# CHECK-LABEL: name: test0
+# CHECK: S_NOP 0, implicit-def undef %0:sub0
+# CHECK: S_NOP 0, implicit-def undef %2:sub1
+# CHECK: S_NOP 0, implicit %2:sub1
+# CHECK: S_NOP 0, implicit-def undef %1:sub1
+# CHECK: S_NOP 0, implicit %1:sub1
+# CHECK: S_NOP 0, implicit-def %0:sub1
+# CHECK: S_NOP 0, implicit %0
+name: test0
+isSSA: true
+registers:
+ - { id: 0, class: sreg_128 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def undef %0:sub0
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0
+...
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index d5e10d0be883..ad8d00c36393 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -16,10 +16,8 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
}
; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_read2_b64
+; SI: ds_write2_b64
; SI: s_endpgm
define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 2bd9fd6858fe..915c4383ff49 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -1,8 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-attributes #0 = { "ShaderType"="1" }
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
; GCN-LABEL: {{^}}vgpr:
@@ -11,7 +9,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
%x = fadd float %3, 1.0
%a = insertvalue {float, float} undef, float %x, 0
@@ -20,15 +18,14 @@ define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32
}
; GCN-LABEL: {{^}}vgpr_literal:
-; GCN: v_mov_b32_e32 v4, v0
+; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
+; GCN: s_waitcnt expcnt(0)
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-DAG: v_mov_b32_e32 v3, -1.0
-; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
-; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
}
@@ -45,8 +42,8 @@ define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)*
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v6
; GCN-NOT: s_endpgm
-attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
-define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+attributes #0 = { "InitialPSInputAddr"="0" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -71,7 +68,7 @@ define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrsp
; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
; GCN: v_mov_b32_e32 v0, 1.0
; GCN-NOT: s_endpgm
-define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
ret float 1.0
}
@@ -85,7 +82,7 @@ define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 in
; GCN-DAG: v_mov_b32_e32 v1, v2
; GCN: v_mov_b32_e32 v2, v3
; GCN-NOT: s_endpgm
-define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
%f = bitcast <2 x i32> %8 to <2 x float>
%s = insertvalue {float, <2 x float>} undef, float %14, 0
%s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
@@ -104,8 +101,8 @@ define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* by
; GCN-DAG: v_mov_b32_e32 v3, v6
; GCN-DAG: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
-define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+attributes #1 = { "InitialPSInputAddr"="1" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -134,8 +131,8 @@ define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrsp
; GCN: v_mov_b32_e32 v3, v8
; GCN: v_mov_b32_e32 v4, v12
; GCN-NOT: s_endpgm
-attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
-define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+attributes #2 = { "InitialPSInputAddr"="119" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -164,8 +161,8 @@ define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addr
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
-define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+attributes #3 = { "InitialPSInputAddr"="418" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -187,7 +184,7 @@ define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addr
; GCN: s_add_i32 s0, s3, 2
; GCN: s_mov_b32 s2, s3
; GCN-NOT: s_endpgm
-define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
%x = add i32 %2, 2
%a = insertvalue {i32, i32, i32} undef, i32 %x, 0
%b = insertvalue {i32, i32, i32} %a, i32 %1, 1
@@ -203,7 +200,7 @@ define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32
; GCN-DAG: s_mov_b32 s2, 7
; GCN-DAG: s_mov_b32 s3, 8
; GCN-NOT: s_endpgm
-define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
%x = add i32 %2, 2
ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
}
@@ -218,7 +215,7 @@ define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i
; GCN: s_mov_b32 s2, s3
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
%v = fadd float %3, 1.0
%s = add i32 %2, 2
@@ -232,14 +229,14 @@ define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval,
; GCN-LABEL: {{^}}structure_literal:
-; GCN: v_mov_b32_e32 v3, v0
+; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
+; GCN: s_waitcnt expcnt(0)
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: s_mov_b32 s0, 2
; GCN-DAG: s_mov_b32 s1, 3
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
-; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
-define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
}
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
new file mode 100644
index 000000000000..f7380cd96921
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; This should end with a no-op sequence of exec mask manipulations.
+; The exec mask should be back in its original state after the unreachable block executes.
+
+; GCN-LABEL: {{^}}main:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: .Lfunc_end0
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+main_body:
+ %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
+ %p87 = fmul float undef, %p83
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ br i1 undef, label %ENDIF69, label %ELSE
+
+ELSE: ; preds = %main_body
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %p126 = fcmp olt float %p125, 0.000000e+00
+ br i1 %p126, label %ENDIF69, label %ELSE41
+
+ELSE41: ; preds = %ELSE
+ unreachable
+
+ENDIF69: ; preds = %ELSE, %main_body
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.floor.f32(float) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll
index 6c144cd56ea7..7d2b5538ca33 100644
--- a/test/CodeGen/AMDGPU/rotl.ll
+++ b/test/CodeGen/AMDGPU/rotl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}rotl_i32:
; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index b67b800c7374..8192b861b602 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.sqrt.f32(float) nounwind readnone
declare double @llvm.sqrt.f64(double) nounwind readnone
@@ -56,15 +56,15 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
; SI: s_endpgm
define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%x = call float @llvm.sqrt.f32(float %a)
%y = fmul float %x, %b
diff --git a/test/CodeGen/AMDGPU/runtime-metadata.ll b/test/CodeGen/AMDGPU/runtime-metadata.ll
new file mode 100644
index 000000000000..052ad5b9c15b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/runtime-metadata.ll
@@ -0,0 +1,848 @@
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 256
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .short 200
+
+; CHECK-LABEL:{{^}}test_char:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_char"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "char"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_ushort2:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_ushort2"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "ushort2"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_int3:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_int3"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "int3"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_int3(<3 x i32> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_ulong4:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "test_ulong4"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "ulong4"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 10
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_half8:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_half8"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "half8"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 5
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_half8(<8 x half> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_float16:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_float16"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 64
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 64
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "float16"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_float16(<16 x float> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_double16:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 13
+; CHECK-NEXT: .ascii "test_double16"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 128
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 128
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "double16"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 11
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_double16(<16 x double> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_pointer:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_pointer"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_image:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_image"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image2d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_sampler:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_sampler"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "sampler_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_sampler(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_queue:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_queue"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "queue_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_struct:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "test_struct"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "struct A"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_struct(%struct.A* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_i128:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_i128"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "i128"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_i128(i128 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_multi_arg:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 14
+; CHECK-NEXT: .ascii "test_multi_arg"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "short2"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "char3"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_addr_space:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 15
+; CHECK-NEXT: .ascii "test_addr_space"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, i32 addrspace(2)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_type_qual:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 14
+; CHECK-NEXT: .ascii "test_type_qual"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 19
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 17
+; CHECK-NEXT: .byte 18
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 20
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, i32 addrspace(1)* %b, %opencl.pipe_t addrspace(1)* %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_access_qual:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .ascii "test_access_qual"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image1d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image2d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image3d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, %opencl.image2d_t addrspace(1)* %wo, %opencl.image3d_t addrspace(1)* %rw) !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_reqd_wgs_vec_type_hint:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 27
+; CHECK-NEXT: .ascii "test_reqd_wgs_vec_type_hint"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 21
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 23
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 !reqd_work_group_size !6 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_wgs_hint_vec_type_hint:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 27
+; CHECK-NEXT: .ascii "test_wgs_hint_vec_type_hint"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 22
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 23
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "uint4"
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 !work_group_size_hint !8 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_ptr_to_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 19
+; CHECK-NEXT: .ascii "test_arg_ptr_to_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "int **"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 * addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_struct_contains_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 28
+; CHECK-NEXT: .ascii "test_arg_struct_contains_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "struct B"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B * byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_vector_of_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 22
+; CHECK-NEXT: .ascii "test_arg_vector_of_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 47
+; CHECK-NEXT: .ascii "global int* __attribute__((ext_vector_type(2)))"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_unknown_builtin_type:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 29
+; CHECK-NEXT: .ascii "test_arg_unknown_builtin_type"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "clk_event_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_unknown_builtin_type(%opencl.clk_event_t addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int *"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int *", !"int *", !"int *"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int **"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
diff --git a/test/CodeGen/AMDGPU/rv7x0_count3.ll b/test/CodeGen/AMDGPU/rv7x0_count3.ll
index c3fd923e4593..50df64bf5471 100644
--- a/test/CodeGen/AMDGPU/rv7x0_count3.ll
+++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll
@@ -1,41 +1,52 @@
; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s
; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
-
-define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
- %1 = extractelement <4 x float> %reg1, i32 0
- %2 = extractelement <4 x float> %reg1, i32 1
- %3 = extractelement <4 x float> %reg1, i32 2
- %4 = extractelement <4 x float> %reg1, i32 3
- %5 = insertelement <4 x float> undef, float %1, i32 0
- %6 = insertelement <4 x float> %5, float %2, i32 1
- %7 = insertelement <4 x float> %6, float %3, i32 2
- %8 = insertelement <4 x float> %7, float %4, i32 3
- %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
- %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
- %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
- %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
- %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
- %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
- %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
- %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
- %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
- %19 = fadd <4 x float> %9, %10
- %20 = fadd <4 x float> %19, %11
- %21 = fadd <4 x float> %20, %12
- %22 = fadd <4 x float> %21, %13
- %23 = fadd <4 x float> %22, %14
- %24 = fadd <4 x float> %23, %15
- %25 = fadd <4 x float> %24, %16
- %26 = fadd <4 x float> %25, %17
- %27 = fadd <4 x float> %26, %18
- call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
- ret void
+define amdgpu_vs void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
+bb:
+ %tmp = extractelement <4 x float> %reg1, i32 0
+ %tmp1 = extractelement <4 x float> %reg1, i32 1
+ %tmp2 = extractelement <4 x float> %reg1, i32 2
+ %tmp3 = extractelement <4 x float> %reg1, i32 3
+ %tmp4 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 1
+ %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 2
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 3
+ %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp10 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp11 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp12 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp13 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp14 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp15 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp16 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 4, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp18 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 5, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp20 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp21 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp22 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp23 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 7, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp24 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp25 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 8, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp26 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 9, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp28 = fadd <4 x float> %tmp9, %tmp11
+ %tmp29 = fadd <4 x float> %tmp28, %tmp13
+ %tmp30 = fadd <4 x float> %tmp29, %tmp15
+ %tmp31 = fadd <4 x float> %tmp30, %tmp17
+ %tmp32 = fadd <4 x float> %tmp31, %tmp19
+ %tmp33 = fadd <4 x float> %tmp32, %tmp21
+ %tmp34 = fadd <4 x float> %tmp33, %tmp23
+ %tmp35 = fadd <4 x float> %tmp34, %tmp25
+ %tmp36 = fadd <4 x float> %tmp35, %tmp27
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp36, i32 0, i32 2)
+ ret void
}
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-attributes #0 = { "ShaderType"="1" }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll
new file mode 100644
index 000000000000..987056010e69
--- /dev/null
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_addk_i32_k0:
+; SI: s_load_dword [[VAL:s[0-9]+]]
+; SI: s_addk_i32 [[VAL]], 0x41
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VRESULT]]
+; SI: s_endpgm
+define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 65
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This should be folded with any number of uses.
+; SI-LABEL: {{^}}s_addk_i32_k0_x2:
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x41
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
+; SI: s_endpgm
+define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+ %add0 = add i32 %a, 65
+ %add1 = add i32 %b, 65
+ store i32 %add0, i32 addrspace(1)* %out0
+ store i32 %add1, i32 addrspace(1)* %out1
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_i32_k1:
+; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 32767 ; (1 << 15) - 1
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_i32_k2:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, -17
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v2i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI: s_endpgm
+define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
+ %add = add <2 x i32> %b, <i32 65, i32 66>
+ store <2 x i32> %add, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v4i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
+; SI: s_endpgm
+define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+ %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
+ store <4 x i32> %add, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v8i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x45
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x46
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
+; SI: s_endpgm
+define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+ %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
+ store <8 x i32> %add, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}no_s_addk_i32_k0:
+; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
+; SI: s_endpgm
+define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 32768 ; 1 << 15
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
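
Every case above comes down to whether the add constant fits the signed 16-bit immediate of s_addk_i32 (and, in the next file, s_mulk_i32): 0x41, 0x7fff and -17 (0xffef as simm16) fold, while 0x8000 does not. A minimal illustrative range check, with a made-up helper name, not the backend's actual folding code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* True when k fits the signed 16-bit SIMM16 field used by
       s_addk_i32 / s_mulk_i32. */
    static bool fits_simm16(int32_t k) {
      return k >= -32768 && k <= 32767;
    }

    int main(void) {
      printf("%d %d %d %d\n",
             fits_simm16(32767),   /* 1: folds (s_addk_i32_k1) */
             fits_simm16(-17),     /* 1: folds (s_addk_i32_k2, 0xffef) */
             fits_simm16(32768),   /* 0: no fold (no_s_addk_i32_k0) */
             fits_simm16(32769));  /* 0: no fold (no_s_mulk_i32_k0) */
      return 0;
    }
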
diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll
new file mode 100644
index 000000000000..33d7eeacdb83
--- /dev/null
+++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_mulk_i32_k0:
+; SI: s_load_dword [[VAL:s[0-9]+]]
+; SI: s_mulk_i32 [[VAL]], 0x41
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VRESULT]]
+; SI: s_endpgm
+define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 65
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_mulk_i32_k1:
+; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
+; SI: s_endpgm
+define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 32767 ; (1 << 15) - 1
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_mulk_i32_k2:
+; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_endpgm
+define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, -17
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}no_s_mulk_i32_k0:
+; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
+; SI: s_endpgm
+define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 32769 ; (1 << 15) + 1
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 551f34339a12..52f3cceac2a0 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -2,8 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
; In this test both the pointer and the offset operands to the
; BUFFER_LOAD instructions end up being stored in vgprs. This
@@ -26,8 +26,8 @@ declare i32 @llvm.r600.read.tidig.y() #0
define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x()
- %tmp1 = call i32 @llvm.r600.read.tidig.y()
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
%tmp2 = sext i32 %tmp to i64
%tmp3 = sext i32 %tmp1 to i64
br label %loop
@@ -51,12 +51,20 @@ done: ; preds = %loop
}
; Test moving an SMRD instruction to the VALU
+; FIXME: movs can be moved before nop to reduce count
; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword [[OUT]]
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_nop 3
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; SI: s_mov_b32
+
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
%tmp = icmp ne i32 %a, 0
@@ -87,7 +95,7 @@ endif: ; preds = %else, %if
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -104,10 +112,10 @@ entry:
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-; GCN-HSA: flat_store_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000
%tmp4 = load i32, i32 addrspace(2)* %tmp3
@@ -127,7 +135,7 @@ entry:
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000
%tmp4 = load i64, i64 addrspace(2)* %tmp3
@@ -149,7 +157,7 @@ entry:
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3
@@ -185,7 +193,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3
@@ -196,22 +204,14 @@ entry:
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@@ -234,7 +234,7 @@ entry:
; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3
@@ -248,10 +248,10 @@ entry:
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
-; GCN-HSA: flat_store_dword [[ADD]]
+; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -265,7 +265,7 @@ entry:
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -279,7 +279,7 @@ entry:
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -294,7 +294,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
%tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
@@ -317,7 +317,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
%tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
@@ -354,7 +354,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
%tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
@@ -389,7 +389,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
%tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
@@ -431,5 +431,33 @@ entry:
ret void
}
+; Make sure we legalize vopc operands after moving an sopc to the VALU.
+
+; GCN-LABEL: {{^}}sopc_vopc_legalize_bug:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN-NOHSA: buffer_store_dword [[ONE]]
+; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
+; GCN: {{^}}[[EXIT]]:
+; GCN: s_endpgm
+define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+bb3:
+ %tmp0 = bitcast i32 %cond to float
+ %tmp1 = fadd float %tmp0, 2.500000e-01
+ %tmp2 = bitcast float %tmp1 to i32
+ %tmp3 = icmp ult i32 %tmp2, %cond
+ br i1 %tmp3, label %bb6, label %bb7
+
+bb6:
+ store i32 1, i32 addrspace(1)* %out
+ br label %bb7
+
+bb7: ; preds = %bb6, %bb3
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
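
The checks in this file all describe the same legalization trade-off: a scalar (SMRD) load needs a uniform address in SGPRs, so when the address ends up in VGPRs the compiler either copies it back with v_readfirstlane and keeps the s_load, or rewrites the access as a per-lane buffer/flat load. A hedged sketch of that choice follows; the enum and function names are illustrative only, not the pass's API:

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { KEEP_SMRD_VIA_READFIRSTLANE, REWRITE_TO_VMEM } Action;

    /* Illustrative decision only: if the address is provably uniform across
       the wave, it can be moved back to SGPRs (v_readfirstlane + s_load, as
       in smrd_valu); otherwise the load must become a per-lane VMEM access
       (buffer_load / flat_load, as in smrd_valu2_mubuf_offset). */
    static Action legalize_scalar_load(bool address_is_uniform) {
      return address_is_uniform ? KEEP_SMRD_VIA_READFIRSTLANE : REWRITE_TO_VMEM;
    }

    int main(void) {
      printf("%d %d\n", legalize_scalar_load(true), legalize_scalar_load(false));
      return 0;
    }
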
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 0970e5d30630..55b392a32729 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,14 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
+; XXX - Why the packing?
; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
+; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
+; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tmp1 = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
@@ -21,11 +20,7 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: buffer_store_dwordx2
define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tmp1 = load float, float addrspace(1)* %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
index 11e8f5176f44..e040639a2d94 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -44,15 +44,15 @@ ENDIF: ; preds = %ELSE17, %ELSE, %IF
%temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
%temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
%temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
- %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00)
- %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
- %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
- %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %27 = call float @llvm.AMDGPU.clamp.f32(float %temp.0, float 0.000000e+00, float 1.000000e+00)
+ %28 = call float @llvm.AMDGPU.clamp.f32(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
+ %29 = call float @llvm.AMDGPU.clamp.f32(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
+ %30 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%31 = insertelement <4 x float> undef, float %27, i32 0
%32 = insertelement <4 x float> %31, float %28, i32 1
%33 = insertelement <4 x float> %32, float %29, i32 2
%34 = insertelement <4 x float> %33, float %30, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %34, i32 0, i32 0)
ret void
ELSE17: ; preds = %ELSE
@@ -74,9 +74,8 @@ ELSE17: ; preds = %ELSE
br label %ENDIF
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
-attributes #1 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index 759197ca61f7..f907e154f962 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -43,15 +43,15 @@ LOOP: ; preds = %IF31, %main_body
br i1 %29, label %IF, label %LOOP29
IF: ; preds = %LOOP
- %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %30 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %31 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%34 = insertelement <4 x float> undef, float %30, i32 0
%35 = insertelement <4 x float> %34, float %31, i32 1
%36 = insertelement <4 x float> %35, float %32, i32 2
%37 = insertelement <4 x float> %36, float %33, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %37, i32 0, i32 0)
ret void
LOOP29: ; preds = %LOOP, %ENDIF30
@@ -81,8 +81,8 @@ ENDIF30: ; preds = %LOOP29
br label %LOOP29
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
index 28cc08abc022..5839785f00d5 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -30,15 +30,15 @@ LOOP: ; preds = %ENDIF, %main_body
br i1 %16, label %IF, label %ENDIF
IF: ; preds = %LOOP
- %17 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %17 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %18 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %19 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %20 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%21 = insertelement <4 x float> undef, float %17, i32 0
%22 = insertelement <4 x float> %21, float %18, i32 1
%23 = insertelement <4 x float> %22, float %19, i32 2
%24 = insertelement <4 x float> %23, float %20, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %24, i32 0, i32 0)
ret void
ENDIF: ; preds = %LOOP
@@ -48,8 +48,8 @@ ENDIF: ; preds = %LOOP
br label %LOOP
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 3f728fd873b3..1bf109dec032 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,21 +1,19 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
-
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, the
; only thing areLoadsFromSameBasePtr accomplishes is ordering the
; loads so that the lower address loads come first.
; FUNC-LABEL: {{^}}cluster_global_arg_loads:
-; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; SI: buffer_store_dword [[REG0]]
; SI: buffer_store_dword [[REG1]]
define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
%load0 = load i32, i32 addrspace(1)* %ptr, align 4
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
%load1 = load i32, i32 addrspace(1)* %gep, align 4
store i32 %load0, i32 addrspace(1)* %out0, align 4
store i32 %load1, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll
index 549465096833..aa67b2e0f7db 100644
--- a/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -66,7 +66,7 @@ ENDIF: ; preds = %IF23, %ELSE, %IF
%45 = insertelement <4 x float> %44, float %temp5.0, i32 1
%46 = insertelement <4 x float> %45, float %temp6.0, i32 2
%47 = insertelement <4 x float> %46, float %temp7.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %47, i32 0, i32 0)
ret void
IF23: ; preds = %ELSE
@@ -89,6 +89,6 @@ IF23: ; preds = %ELSE
declare float @fabs(float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readonly }
diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll
index 94c653c8f25b..6637b3897717 100644
--- a/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-if.ll
@@ -32,7 +32,7 @@ ENDIF: ; preds = %IF13, %ELSE, %main_
%17 = insertelement <4 x float> %16, float %temp1.0, i32 1
%18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2
%19 = insertelement <4 x float> %18, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %19, i32 0, i32 0)
ret void
IF13: ; preds = %ELSE
@@ -43,4 +43,4 @@ IF13: ; preds = %ELSE
br label %ENDIF
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 6b3e0814c380..886d4a1dcb5c 100644
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -1,18 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
; FUNC-LABEL: {{^}}cluster_arg_loads:
+; FIXME: Due to changes in the load clustering heuristics, we no longer
+; cluster all argument loads together on SI.
+; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
store i32 %x, i32 addrspace(1)* %out0, align 4
store i32 %y, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index e4b16c0a165f..9b490bb3a731 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,13 +1,10 @@
-; XFAIL: *
-; REQUIRES: asserts
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
-declare void @llvm.AMDGPU.barrier.local() nounwind convergent
+declare void @llvm.amdgcn.s.barrier() nounwind convergent
-
-; SI-LABEL: {{^}}main(
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+; GCN-LABEL: {{^}}main:
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 2
@@ -39,63 +36,63 @@ ENDIF: ; preds = %main_body, %Flow2
%temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
%15 = extractelement <4 x float> %reg1, i32 1
%16 = extractelement <4 x float> %reg1, i32 3
- %17 = load <4 x float>, <4 x float> addrspace(9)* null
+ %17 = load <4 x float>, <4 x float> addrspace(2)* null
%18 = extractelement <4 x float> %17, i32 0
%19 = fmul float %18, %0
- %20 = load <4 x float>, <4 x float> addrspace(9)* null
+ %20 = load <4 x float>, <4 x float> addrspace(2)* null
%21 = extractelement <4 x float> %20, i32 1
%22 = fmul float %21, %0
- %23 = load <4 x float>, <4 x float> addrspace(9)* null
+ %23 = load <4 x float>, <4 x float> addrspace(2)* null
%24 = extractelement <4 x float> %23, i32 2
%25 = fmul float %24, %0
- %26 = load <4 x float>, <4 x float> addrspace(9)* null
+ %26 = load <4 x float>, <4 x float> addrspace(2)* null
%27 = extractelement <4 x float> %26, i32 3
%28 = fmul float %27, %0
- %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%30 = extractelement <4 x float> %29, i32 0
%31 = fmul float %30, %15
%32 = fadd float %31, %19
- %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%34 = extractelement <4 x float> %33, i32 1
%35 = fmul float %34, %15
%36 = fadd float %35, %22
- %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%38 = extractelement <4 x float> %37, i32 2
%39 = fmul float %38, %15
%40 = fadd float %39, %25
- %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%42 = extractelement <4 x float> %41, i32 3
%43 = fmul float %42, %15
%44 = fadd float %43, %28
- %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%46 = extractelement <4 x float> %45, i32 0
%47 = fmul float %46, %1
%48 = fadd float %47, %32
- %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%50 = extractelement <4 x float> %49, i32 1
%51 = fmul float %50, %1
%52 = fadd float %51, %36
- %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%54 = extractelement <4 x float> %53, i32 2
%55 = fmul float %54, %1
%56 = fadd float %55, %40
- %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%58 = extractelement <4 x float> %57, i32 3
%59 = fmul float %58, %1
%60 = fadd float %59, %44
- %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%62 = extractelement <4 x float> %61, i32 0
%63 = fmul float %62, %16
%64 = fadd float %63, %48
- %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%66 = extractelement <4 x float> %65, i32 1
%67 = fmul float %66, %16
%68 = fadd float %67, %52
- %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%70 = extractelement <4 x float> %69, i32 2
%71 = fmul float %70, %16
%72 = fadd float %71, %56
- %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%74 = extractelement <4 x float> %73, i32 3
%75 = fmul float %74, %16
%76 = fadd float %75, %60
@@ -103,12 +100,12 @@ ENDIF: ; preds = %main_body, %Flow2
%78 = insertelement <4 x float> %77, float %68, i32 1
%79 = insertelement <4 x float> %78, float %72, i32 2
%80 = insertelement <4 x float> %79, float %76, i32 3
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.amdgcn.s.barrier()
%81 = insertelement <4 x float> undef, float %temp.0, i32 0
%82 = insertelement <4 x float> %81, float %temp1.0, i32 1
%83 = insertelement <4 x float> %82, float %temp2.0, i32 2
%84 = insertelement <4 x float> %83, float %temp3.0, i32 3
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.amdgcn.s.barrier()
ret void
LOOP: ; preds = %main_body, %Flow
@@ -159,5 +156,3 @@ ENDIF19: ; preds = %ENDIF16
%115 = fadd float %temp4.0, 1.000000e+00
br label %Flow1
}
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
index 8d980dbf8995..00d4ba66913d 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -85,12 +85,12 @@ ENDIF: ; preds = %ENDIF16, %LOOP, %ma
%72 = insertelement <4 x float> %71, float %62, i32 1
%73 = insertelement <4 x float> %72, float %66, i32 2
%74 = insertelement <4 x float> %73, float %70, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %74, i32 60, i32 1)
%75 = insertelement <4 x float> undef, float %temp.0, i32 0
%76 = insertelement <4 x float> %75, float %temp1.0, i32 1
%77 = insertelement <4 x float> %76, float %temp2.0, i32 2
%78 = insertelement <4 x float> %77, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %78, i32 0, i32 2)
ret void
LOOP: ; preds = %main_body, %ENDIF19
@@ -127,6 +127,4 @@ ENDIF19: ; preds = %ENDIF16
br label %LOOP
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index d43de4766057..a66f074123c1 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12 bits, make sure we don't store
; it in mubuf's offset field.
@@ -49,8 +47,8 @@ done:
}
-; GCN-LABEL: {{^}}legal_offset_fi_offset
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; GCN-LABEL: {{^}}legal_offset_fi_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
; GCN: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000
; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
@@ -87,11 +85,8 @@ done:
ret void
}
-; GCN-LABEL: @neg_vaddr_offset
-; We can't prove %offset is positive, so we must do the computation with the
-; immediate in an add instruction instead of folding offset and the immediate into
-; the store instruction.
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-LABEL: {{^}}neg_vaddr_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
define void @neg_vaddr_offset(i32 %offset) {
entry:
%array = alloca [8192 x i32]
@@ -101,9 +96,8 @@ entry:
ret void
}
-; GCN-LABEL: @pos_vaddr_offse
-; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
-; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-LABEL: {{^}}pos_vaddr_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
entry:
%array = alloca [8192 x i32]
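
Both stores above exercise the MUBUF immediate offset field, which is 12 bits unsigned (0..4095): a 16-byte frame offset folds into "offen offset:16", while the 0x8000 offset in legal_offset_fi_offset has to go through a separate v_add_i32. A small illustrative check with a made-up helper name, not the backend's code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: offsets that fit the 12-bit unsigned MUBUF offset
       field can be encoded directly; larger ones need a register add. */
    static bool fits_mubuf_offset(uint32_t byte_offset) {
      return byte_offset < 4096u;
    }

    int main(void) {
      printf("%d %d\n", fits_mubuf_offset(16), fits_mubuf_offset(0x8000)); /* 1 0 */
      return 0;
    }
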
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index de645353a401..29d893414c07 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -34,8 +34,8 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; working.
; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
; SI: v_add_i32
; SI: v_lshrrev_b32
@@ -82,6 +82,60 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
ret void
}
+; FUNC-LABEL: {{^}}v_sdiv_i8:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = sdiv i8 %num, %den
+ %result.ext = sext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i23:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = sdiv i23 %num, %den
+ %result.ext = sext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i24:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = sdiv i24 %num, %den
+ %result.ext = sext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i25:
+; SI-NOT: v_rcp_f32
+define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+ %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
+ %num = load i25, i25 addrspace(1) * %in
+ %den = load i25, i25 addrspace(1) * %den_ptr
+ %result = sdiv i25 %num, %den
+ %result.ext = sext i25 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
; Tests for 64-bit divide bypass.
; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = sdiv i64 %a, %b
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
index ad5df39f5505..ccabd3c2a969 100644
--- a/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -181,13 +181,13 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}srem25_i32:
+; FUNC-LABEL: {{^}}no_srem25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -200,40 +200,138 @@ define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
%num.i24.0 = shl i32 %num, 8
- %den.i24.0 = shl i32 %den, 7
+ %den.i25.0 = shl i32 %den, 7
%num.i24 = ashr i32 %num.i24.0, 8
- %den.i24 = ashr i32 %den.i24.0, 7
- %result = srem i32 %num.i24, %den.i24
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = sdiv i32 %num.i24, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
+ %num.i25.0 = shl i32 %num, 7
%den.i24.0 = shl i32 %den, 8
- %num.i24 = ashr i32 %num.i24.0, 7
+ %num.i25 = ashr i32 %num.i25.0, 7
%den.i24 = ashr i32 %den.i24.0, 8
- %result = srem i32 %num.i24, %den.i24
+ %result = sdiv i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i25.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = srem i32 %num.i24, %den.i25
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i25.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i25 = ashr i32 %num.i25.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i24_i11_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i11.0 = shl i32 %den, 21
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i11 = ashr i32 %den.i11.0, 21
+ %result = srem i32 %num.i24, %den.i11
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i11_i24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i11.0 = shl i32 %num, 21
+ %den.i24.0 = shl i32 %den, 8
+ %num.i11 = ashr i32 %num.i11.0, 21
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i11, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i17_i12_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i17.0 = shl i32 %num, 15
+ %den.i12.0 = shl i32 %den, 20
+ %num.i17 = ashr i32 %num.i17.0, 15
+ %den.i12 = ashr i32 %den.i12.0, 20
+ %result = sdiv i32 %num.i17, %den.i12
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
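
The 24/25-bit split in these sdiv/srem tests follows from float's 24-bit significand: integers with at most 24 significant bits convert to f32 and back exactly, so the reciprocal-based expansion is safe to use, while at 25 bits that guarantee is gone (hence the SI-NOT: v_rcp_f32 checks). An illustrative round-trip check, not the actual expansion code:

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 1 when x survives an int -> float -> int round trip. */
    static int roundtrips_exactly(int32_t x) {
      return (int32_t)(float)x == x;
    }

    int main(void) {
      printf("%d\n", roundtrips_exactly((1 << 23) - 1)); /* 1: fits in 24 signed bits */
      printf("%d\n", roundtrips_exactly((1 << 24) + 1)); /* 0: 25 bits, rounds in f32 */
      return 0;
    }
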
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
index a9b2b7f9df55..a7ce948acd4f 100644
--- a/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -1,8 +1,8 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;FUNC-LABEL: {{^}}test_sdiv:
+;FUNC-LABEL: {{^}}s_test_sdiv:
;EG: RECIP_UINT
;EG: LSHL {{.*}}, 1,
;EG: BFE_UINT
@@ -36,47 +36,47 @@
;EG: BFE_UINT
;EG: BFE_UINT
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN-NOT: v_mad_f32
-;SI-NOT: v_lshr_b64
-;VI-NOT: v_lshrrev_b64
-;GCN: s_endpgm
-define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN-NOT: v_mad_f32
+; SI-NOT: v_lshr_b64
+; VI-NOT: v_lshrrev_b64
+; GCN: s_endpgm
+define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = sdiv i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_srem:
+;FUNC-LABEL: {{^}}s_test_srem:
;EG: RECIP_UINT
;EG: BFE_UINT
;EG: BFE_UINT
@@ -144,7 +144,7 @@ define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = urem i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
index 6735394e93a9..2406831b94c5 100644
--- a/test/CodeGen/AMDGPU/select-i1.ll
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
@@ -13,3 +13,15 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind
ret void
}
+; FUNC-LABEL: {{^}}s_minmax_i1:
+; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45
+; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[COND]]
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+ %cmp = icmp slt i1 %cond, false
+ %sel = select i1 %cmp, i1 %a, i1 %b
+ store i1 %sel, i1 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 94758ad84c18..faf8d8a12c25 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Test expansion of scalar selects on vectors.
@@ -29,30 +29,50 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
ret void
}
-; FUNC-LABEL: {{^}}select_v2i32:
+; FIXME: Expansion with bitwise operations may be better if doing a
+; vector select with SGPR inputs.
+
+; FUNC-LABEL: {{^}}s_select_v2i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}select_v4i32:
+; FUNC-LABEL: {{^}}s_select_v4i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
-define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
ret void
}
+; FUNC-LABEL: {{^}}v_select_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_cmp_gt_u32_e64 vcc, 32, s{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+bb:
+ %tmp2 = icmp ult i32 %cond, 32
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
+ store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
; FUNC-LABEL: {{^}}select_v8i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
@@ -69,24 +89,61 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
ret void
}
-; FUNC-LABEL: {{^}}select_v2f32:
+; FUNC-LABEL: {{^}}s_select_v2f32:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x float> %a, <2 x float> %b
store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
ret void
}
-; FUNC-LABEL: {{^}}select_v4f32:
+; FUNC-LABEL: {{^}}s_select_v4f32:
+; SI: s_load_dwordx4
+; SI: s_load_dwordx4
+; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+
; SI: buffer_store_dwordx4
-define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x float> %a, <4 x float> %b
store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
ret void
}
+; FUNC-LABEL: {{^}}v_select_v4f32:
+; SI: buffer_load_dwordx4
+; SI: v_cmp_gt_u32_e64 vcc, 32, s{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+bb:
+ %tmp2 = icmp ult i32 %cond, 32
+ %val = load <4 x float>, <4 x float> addrspace(1)* %in
+ %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
+ store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
; FUNC-LABEL: {{^}}select_v8f32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
@@ -154,3 +211,9 @@ define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x
store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
ret void
}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll
new file mode 100644
index 000000000000..37f2747d9815
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selected-stack-object.ll
@@ -0,0 +1,15 @@
+; "Assertion failure" should be caught with both XFAIL:* and +Asserts.
+; XFAIL: *
+; REQUIRES: asserts
+
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+
+; See also local-stack-slot-bug.ll
+; This fails because a stack object is created during instruction selection.
+
+; CHECK-LABEL: {{^}}main:
+define amdgpu_ps float @main(i32 %idx) {
+main_body:
+ %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+ ret float %v1
+}
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
index 63d74820f961..d2c57a810c2c 100644
--- a/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -36,34 +36,30 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
ret void
}
-; This really folds away to false
-; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
+; GCN-NOT: v_cmp
; GCN: v_cmp_eq_i32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = sext i1 %icmp0 to i32
- %icmp1 = icmp eq i32 %ext, 1
+ %icmp1 = icmp eq i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
-; This really folds away to true
-; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
-; GCN: v_cmp_ne_i32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; GCN-NEXT: buffer_store_byte [[TMP]]
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
- %icmp1 = icmp ne i32 %ext, 1
+ %icmp1 = icmp ne i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
@@ -123,20 +119,28 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
ret void
}
-; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
-; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}}
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN: buffer_store_byte
-; GCN: s_endpgm
-define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+; Reduces to false:
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_neg1:
+; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, -1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; Reduces to true:
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_neg1:
+; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
- %ext = sext i1 %icmp0 to i32
- %icmp1 = icmp ne i32 %ext, 2
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
@@ -145,10 +149,10 @@ define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff
-; GCN: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
-; GCN: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
+; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK255]]
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
@@ -162,7 +166,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN: buffer_load_sbyte [[B:v[0-9]+]]
; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
%b = load i8, i8 addrspace(1)* %b.ptr
@@ -193,10 +197,10 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff
-; GCN: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
-; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
@@ -240,3 +244,40 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
+
+; FIXME: These cases should really be able to fold to true/false in
+; DAGCombiner
+
+; This really folds away to false
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 2
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index f33a82df5ffb..c89e712e4cb0 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -1,5 +1,5 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -375,3 +375,37 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra,
store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
ret void
}
+
+; Make sure we don't try to emit i1 setcc ops
+; FUNC-LABEL: setcc-i1
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
+; SI: s_cmp_eq_i32 [[AND]], 0
+define void @setcc-i1(i32 %in) {
+ %and = and i32 %in, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %endif, label %if
+if:
+ unreachable
+endif:
+ ret void
+}
+
+; FUNC-LABEL: setcc-i1-and-xor
+; SI-DAG: v_cmp_le_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: v_cmp_ge_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], 1.0, s{{[0-9]+}}
+; SI: s_and_b64 s[2:3], [[A]], [[B]]
+define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+bb0:
+ %tmp5 = fcmp oge float %cond, 0.000000e+00
+ %tmp7 = fcmp ole float %cond, 1.000000e+00
+ %tmp9 = and i1 %tmp5, %tmp7
+ %tmp11 = xor i1 %tmp9, 1
+ br i1 %tmp11, label %bb2, label %bb1
+
+bb1:
+ store i32 0, i32 addrspace(1)* %out
+ br label %bb2
+
+bb2:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
index 231be7aa3da7..15db03cf906e 100644
--- a/test/CodeGen/AMDGPU/setcc64.ll
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -59,7 +59,7 @@ entry:
; FUNC-LABEL: {{^}}f64_one:
; SI: v_cmp_lg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
@@ -80,7 +80,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ueq:
; SI: v_cmp_nlg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
@@ -92,7 +92,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ugt:
; SI: v_cmp_nle_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
@@ -103,7 +103,7 @@ entry:
; FUNC-LABEL: {{^}}f64_uge:
; SI: v_cmp_nlt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
@@ -114,7 +114,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ult:
; SI: v_cmp_nge_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
@@ -125,7 +125,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ule:
; SI: v_cmp_ngt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b
diff --git a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
new file mode 100644
index 000000000000..08bdc3aba555
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -0,0 +1,22 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+; XUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+;
+; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+
+; Works with the align 2 removed
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 23ae3b967971..a6c72a5165d6 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
@@ -95,17 +95,6 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
@@ -121,16 +110,6 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
@@ -145,17 +124,6 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG-NOT: BFE_INT
-
-; EG: ASHR [[RES_HI]]
-
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 32
@@ -300,7 +268,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
- store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out
ret void
}
@@ -458,7 +426,8 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
%tmp5 = load i8, i8 addrspace(1)* %src, align 1
%tmp2 = sext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp2.5 = icmp sgt i32 %tmp2, 0
+ %tmp3 = select i1 %tmp2.5, i32 %tmp2, i32 0
%tmp4 = trunc i32 %tmp3 to i8
%tmp6 = sext i8 %tmp4 to i16
store i16 %tmp6, i16 addrspace(1)* %out, align 2
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 38289ced632a..f1b8e8eec85d 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
;
;
; Most SALU instructions ignore control flow, so we need to make sure
@@ -40,7 +40,7 @@ endif:
define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tid_f = uitofp i32 %tid to float
%tmp1 = fcmp ueq float %tid_f, 0.0
br i1 %tmp1, label %if, label %else
@@ -67,7 +67,7 @@ endif:
; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-; SI: BB2_1:
+; SI: BB2_2:
; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
@@ -77,7 +77,7 @@ endif:
; SI: buffer_store_dword [[RESULT]]
define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp eq i32 %tid, 0
br i1 %tmp1, label %if, label %else
@@ -100,6 +100,6 @@ endif:
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index b849c4038bc7..da270c533ece 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -3,189 +3,193 @@
; This test checks that no VGPR to SGPR copies are created by the register
; allocator.
+
+
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
-
-define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
- %25 = fptosi float %23 to i32
- %26 = icmp ne i32 %25, 0
- br i1 %26, label %ENDIF, label %ELSE
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32)
+ %tmp24 = fptosi float %tmp22 to i32
+ %tmp25 = icmp ne i32 %tmp24, 0
+ br i1 %tmp25, label %ENDIF, label %ELSE
ELSE: ; preds = %main_body
- %27 = fsub float -0.000000e+00, %22
+ %tmp26 = fsub float -0.000000e+00, %tmp21
br label %ENDIF
-ENDIF: ; preds = %main_body, %ELSE
- %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ]
- %28 = fadd float %temp.0, %24
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00)
+ENDIF: ; preds = %ELSE, %main_body
+ %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ]
+ %tmp27 = fadd float %temp.0, %tmp23
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00)
ret void
}
; Make sure this program doesn't crash
; CHECK-LABEL: {{^}}phi2:
-define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
- %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40)
- %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48)
- %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52)
- %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56)
- %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64)
- %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68)
- %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72)
- %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76)
- %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80)
- %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
- %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
- %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
- %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1
- %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
- %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1
- %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
- %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
- %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
- %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5)
- %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5)
- %46 = bitcast float %41 to i32
- %47 = bitcast float %42 to i32
- %48 = insertelement <2 x i32> undef, i32 %46, i32 0
- %49 = insertelement <2 x i32> %48, i32 %47, i32 1
- %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2)
- %51 = extractelement <4 x float> %50, i32 2
- %52 = call float @fabs(float %51)
- %53 = fmul float %43, %43
- %54 = fmul float %44, %44
- %55 = fadd float %54, %53
- %56 = fmul float %45, %45
- %57 = fadd float %55, %56
- %58 = call float @llvm.AMDGPU.rsq.f32(float %57)
- %59 = fmul float %43, %58
- %60 = fmul float %44, %58
- %61 = fmul float %45, %58
- %62 = fmul float %59, %23
- %63 = fmul float %60, %24
- %64 = fadd float %63, %62
- %65 = fmul float %61, %25
- %66 = fadd float %64, %65
- %67 = fsub float -0.000000e+00, %26
- %68 = fmul float %66, %52
- %69 = fadd float %68, %67
- %70 = fmul float %27, %69
- %71 = fmul float %28, %69
- %72 = call float @fabs(float %70)
- %73 = fcmp olt float 0x3EE4F8B580000000, %72
- %74 = sext i1 %73 to i32
- %75 = bitcast i32 %74 to float
- %76 = bitcast float %75 to i32
- %77 = icmp ne i32 %76, 0
- br i1 %77, label %IF, label %ENDIF
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 36)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 40)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 48)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 52)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 56)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 64)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 68)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 72)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 76)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 80)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 84)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 88)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 92)
+ %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
+ %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0
+ %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
+ %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0
+ %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp41 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp42 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp43 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp44 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp45 = bitcast float %tmp40 to i32
+ %tmp46 = bitcast float %tmp41 to i32
+ %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0
+ %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1
+ %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32>
+ %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp50 = extractelement <4 x float> %tmp49, i32 2
+ %tmp51 = call float @fabs(float %tmp50)
+ %tmp52 = fmul float %tmp42, %tmp42
+ %tmp53 = fmul float %tmp43, %tmp43
+ %tmp54 = fadd float %tmp53, %tmp52
+ %tmp55 = fmul float %tmp44, %tmp44
+ %tmp56 = fadd float %tmp54, %tmp55
+ %tmp57 = call float @llvm.amdgcn.rsq.f32(float %tmp56)
+ %tmp58 = fmul float %tmp42, %tmp57
+ %tmp59 = fmul float %tmp43, %tmp57
+ %tmp60 = fmul float %tmp44, %tmp57
+ %tmp61 = fmul float %tmp58, %tmp22
+ %tmp62 = fmul float %tmp59, %tmp23
+ %tmp63 = fadd float %tmp62, %tmp61
+ %tmp64 = fmul float %tmp60, %tmp24
+ %tmp65 = fadd float %tmp63, %tmp64
+ %tmp66 = fsub float -0.000000e+00, %tmp25
+ %tmp67 = fmul float %tmp65, %tmp51
+ %tmp68 = fadd float %tmp67, %tmp66
+ %tmp69 = fmul float %tmp26, %tmp68
+ %tmp70 = fmul float %tmp27, %tmp68
+ %tmp71 = call float @fabs(float %tmp69)
+ %tmp72 = fcmp olt float 0x3EE4F8B580000000, %tmp71
+ %tmp73 = sext i1 %tmp72 to i32
+ %tmp74 = bitcast i32 %tmp73 to float
+ %tmp75 = bitcast float %tmp74 to i32
+ %tmp76 = icmp ne i32 %tmp75, 0
+ br i1 %tmp76, label %IF, label %ENDIF
IF: ; preds = %main_body
- %78 = fsub float -0.000000e+00, %70
- %79 = call float @llvm.AMDIL.exp.(float %78)
- %80 = fsub float -0.000000e+00, %79
- %81 = fadd float 1.000000e+00, %80
- %82 = fdiv float 1.000000e+00, %70
- %83 = fmul float %81, %82
- %84 = fmul float %32, %83
+ %tmp77 = fsub float -0.000000e+00, %tmp69
+ %tmp78 = call float @llvm.exp2.f32(float %tmp77)
+ %tmp79 = fsub float -0.000000e+00, %tmp78
+ %tmp80 = fadd float 1.000000e+00, %tmp79
+ %tmp81 = fdiv float 1.000000e+00, %tmp69
+ %tmp82 = fmul float %tmp80, %tmp81
+ %tmp83 = fmul float %tmp31, %tmp82
br label %ENDIF
-ENDIF: ; preds = %main_body, %IF
- %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ]
- %85 = call float @fabs(float %71)
- %86 = fcmp olt float 0x3EE4F8B580000000, %85
- %87 = sext i1 %86 to i32
- %88 = bitcast i32 %87 to float
- %89 = bitcast float %88 to i32
- %90 = icmp ne i32 %89, 0
- br i1 %90, label %IF25, label %ENDIF24
+ENDIF: ; preds = %IF, %main_body
+ %temp4.0 = phi float [ %tmp83, %IF ], [ %tmp31, %main_body ]
+ %tmp84 = call float @fabs(float %tmp70)
+ %tmp85 = fcmp olt float 0x3EE4F8B580000000, %tmp84
+ %tmp86 = sext i1 %tmp85 to i32
+ %tmp87 = bitcast i32 %tmp86 to float
+ %tmp88 = bitcast float %tmp87 to i32
+ %tmp89 = icmp ne i32 %tmp88, 0
+ br i1 %tmp89, label %IF25, label %ENDIF24
IF25: ; preds = %ENDIF
- %91 = fsub float -0.000000e+00, %71
- %92 = call float @llvm.AMDIL.exp.(float %91)
- %93 = fsub float -0.000000e+00, %92
- %94 = fadd float 1.000000e+00, %93
- %95 = fdiv float 1.000000e+00, %71
- %96 = fmul float %94, %95
- %97 = fmul float %36, %96
+ %tmp90 = fsub float -0.000000e+00, %tmp70
+ %tmp91 = call float @llvm.exp2.f32(float %tmp90)
+ %tmp92 = fsub float -0.000000e+00, %tmp91
+ %tmp93 = fadd float 1.000000e+00, %tmp92
+ %tmp94 = fdiv float 1.000000e+00, %tmp70
+ %tmp95 = fmul float %tmp93, %tmp94
+ %tmp96 = fmul float %tmp35, %tmp95
br label %ENDIF24
-ENDIF24: ; preds = %ENDIF, %IF25
- %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ]
- %98 = fmul float %29, %temp4.0
- %99 = fmul float %30, %temp4.0
- %100 = fmul float %31, %temp4.0
- %101 = fmul float %33, %temp8.0
- %102 = fadd float %101, %98
- %103 = fmul float %34, %temp8.0
- %104 = fadd float %103, %99
- %105 = fmul float %35, %temp8.0
- %106 = fadd float %105, %100
- %107 = call float @llvm.pow.f32(float %52, float %22)
- %108 = fsub float -0.000000e+00, %102
- %109 = fmul float %108, %107
- %110 = fsub float -0.000000e+00, %104
- %111 = fmul float %110, %107
- %112 = fsub float -0.000000e+00, %106
- %113 = fmul float %112, %107
- %114 = call i32 @llvm.SI.packf16(float %109, float %111)
- %115 = bitcast i32 %114 to float
- %116 = call i32 @llvm.SI.packf16(float %113, float 1.000000e+00)
- %117 = bitcast i32 %116 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ENDIF24: ; preds = %IF25, %ENDIF
+ %temp8.0 = phi float [ %tmp96, %IF25 ], [ %tmp35, %ENDIF ]
+ %tmp97 = fmul float %tmp28, %temp4.0
+ %tmp98 = fmul float %tmp29, %temp4.0
+ %tmp99 = fmul float %tmp30, %temp4.0
+ %tmp100 = fmul float %tmp32, %temp8.0
+ %tmp101 = fadd float %tmp100, %tmp97
+ %tmp102 = fmul float %tmp33, %temp8.0
+ %tmp103 = fadd float %tmp102, %tmp98
+ %tmp104 = fmul float %tmp34, %temp8.0
+ %tmp105 = fadd float %tmp104, %tmp99
+ %tmp106 = call float @llvm.pow.f32(float %tmp51, float %tmp21)
+ %tmp107 = fsub float -0.000000e+00, %tmp101
+ %tmp108 = fmul float %tmp107, %tmp106
+ %tmp109 = fsub float -0.000000e+00, %tmp103
+ %tmp110 = fmul float %tmp109, %tmp106
+ %tmp111 = fsub float -0.000000e+00, %tmp105
+ %tmp112 = fmul float %tmp111, %tmp106
+ %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
+ %tmp114 = bitcast i32 %tmp113 to float
+ %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
+ %tmp116 = bitcast i32 %tmp115 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116)
ret void
}
; We just want to make sure the program doesn't crash
; CHECK-LABEL: {{^}}loop:
-
-define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
- %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
- %26 = fptosi float %25 to i32
- %27 = bitcast i32 %26 to float
- %28 = bitcast float %27 to i32
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 4)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 8)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 12)
+ %tmp25 = fptosi float %tmp24 to i32
+ %tmp26 = bitcast i32 %tmp25 to float
+ %tmp27 = bitcast float %tmp26 to i32
br label %LOOP
LOOP: ; preds = %ENDIF, %main_body
- %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
- %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
- %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
- %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
- %29 = bitcast float %temp8.0 to i32
- %30 = icmp sge i32 %29, %28
- %31 = sext i1 %30 to i32
- %32 = bitcast i32 %31 to float
- %33 = bitcast float %32 to i32
- %34 = icmp ne i32 %33, 0
- br i1 %34, label %IF, label %ENDIF
+ %temp4.0 = phi float [ %tmp21, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %tmp22, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %tmp23, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %tmp36, %ENDIF ]
+ %tmp28 = bitcast float %temp8.0 to i32
+ %tmp29 = icmp sge i32 %tmp28, %tmp27
+ %tmp30 = sext i1 %tmp29 to i32
+ %tmp31 = bitcast i32 %tmp30 to float
+ %tmp32 = bitcast float %tmp31 to i32
+ %tmp33 = icmp ne i32 %tmp32, 0
+ br i1 %tmp33, label %IF, label %ENDIF
IF: ; preds = %LOOP
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
ret void
ENDIF: ; preds = %LOOP
- %35 = bitcast float %temp8.0 to i32
- %36 = add i32 %35, 1
- %37 = bitcast i32 %36 to float
+ %tmp34 = bitcast float %temp8.0 to i32
+ %tmp35 = add i32 %tmp34, 1
+ %tmp36 = bitcast i32 %tmp35 to float
br label %LOOP
}
@@ -197,29 +201,19 @@ declare float @fabs(float) #2
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readonly }
-attributes #3 = { readnone }
-attributes #4 = { nounwind readonly }
-
-!0 = !{!"const", null}
-!1 = !{!0, !0, i64 0, i32 1}
-
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #3
+declare float @llvm.amdgcn.rsq.f32(float) #1
-; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #3
+declare float @llvm.exp2.f32(float) #1
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #4
+; Function Attrs: nounwind readnone
+declare float @llvm.pow.f32(float, float) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
@@ -233,114 +227,109 @@ declare i32 @llvm.SI.packf16(float, float) #1
; CHECK: image_sample
; CHECK: exp
; CHECK: s_endpgm
-define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-
+define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
entry:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
- %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2
- %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2
- %28 = fcmp oeq float %23, 0.0
- br i1 %28, label %if, label %else
-
-if:
- %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 0, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 16)
+ %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0
+ %tmp25 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp26 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp25, !tbaa !0
+ %tmp27 = fcmp oeq float %tmp22, 0.000000e+00
+ %tmp26.bc = bitcast <16 x i8> %tmp26 to <4 x i32>
+ br i1 %tmp27, label %if, label %else
+
+if: ; preds = %entry
+ %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.if.0 = extractelement <4 x float> %val.if, i32 0
%val.if.1 = extractelement <4 x float> %val.if, i32 1
%val.if.2 = extractelement <4 x float> %val.if, i32 2
br label %endif
-else:
- %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 1, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+else: ; preds = %entry
+ %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1, i32 0>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.else.0 = extractelement <4 x float> %val.else, i32 0
%val.else.1 = extractelement <4 x float> %val.else, i32 1
%val.else.2 = extractelement <4 x float> %val.else, i32 2
br label %endif
-endif:
- %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else]
- %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else]
- %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else]
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0)
+endif: ; preds = %else, %if
+ %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ]
+ %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ]
+ %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ]
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00)
ret void
}
-!2 = !{!"const", null, i32 1}
-
; CHECK-LABEL: {{^}}copy1:
; CHECK: buffer_load_dword
; CHECK: v_add
; CHECK: s_endpgm
define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
entry:
- %0 = load float, float addrspace(1)* %in0
- %1 = fcmp oeq float %0, 0.0
- br i1 %1, label %if0, label %endif
+ %tmp = load float, float addrspace(1)* %in0
+ %tmp1 = fcmp oeq float %tmp, 0.000000e+00
+ br i1 %tmp1, label %if0, label %endif
-if0:
- %2 = bitcast float %0 to i32
- %3 = fcmp olt float %0, 0.0
- br i1 %3, label %if1, label %endif
+if0: ; preds = %entry
+ %tmp2 = bitcast float %tmp to i32
+ %tmp3 = fcmp olt float %tmp, 0.000000e+00
+ br i1 %tmp3, label %if1, label %endif
-if1:
- %4 = add i32 %2, 1
+if1: ; preds = %if0
+ %tmp4 = add i32 %tmp2, 1
br label %endif
-endif:
- %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ]
- %6 = bitcast i32 %5 to float
- store float %6, float addrspace(1)* %out
+endif: ; preds = %if1, %if0, %entry
+ %tmp5 = phi i32 [ 0, %entry ], [ %tmp2, %if0 ], [ %tmp4, %if1 ]
+ %tmp6 = bitcast i32 %tmp5 to float
+ store float %tmp6, float addrspace(1)* %out
ret void
}
; This test is just checking that we don't crash / assertion fail.
; CHECK-LABEL: {{^}}copy2:
; CHECK: s_endpgm
-
-define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
entry:
br label %LOOP68
-LOOP68:
+LOOP68: ; preds = %ENDIF69, %entry
%temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ]
%t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ]
%g = icmp eq i32 0, %t
%l = bitcast float %temp4.7 to i32
br i1 %g, label %IF70, label %ENDIF69
-IF70:
+IF70: ; preds = %LOOP68
%q = icmp ne i32 %l, 13
%temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
ret void
-ENDIF69:
+ENDIF69: ; preds = %LOOP68
%u = add i32 %l, %t
%v = bitcast i32 %u to float
%x = add i32 %t, -1
br label %LOOP68
}
-attributes #0 = { "ShaderType"="0" }
-
; This test checks that image_sample resource descriptors aren't loaded into
; vgprs. The verifier will fail if this happens.
; CHECK-LABEL:{{^}}sample_rsrc:
; CHECK: image_sample
; CHECK: image_sample
; CHECK: s_endpgm
-define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
bb:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
- %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2
%tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
%tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
- %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+ %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !2
%tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
- %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+ %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !2
%tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
%tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
%tmp31 = bitcast float %tmp23 to i32
@@ -352,9 +341,8 @@ bb38: ; preds = %bb
%tmp53 = bitcast float %tmp30 to i32
%tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
%tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
- %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8>
- %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8>
- %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2)
+ %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32>
+ %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
br label %bb71
bb80: ; preds = %bb
@@ -363,9 +351,8 @@ bb80: ; preds = %bb
%tmp82.2 = add i32 %tmp82, 1
%tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
%tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
- %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8>
- %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8>
- %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2)
+ %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32>
+ %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
br label %bb71
bb71: ; preds = %bb80, %bb38
@@ -375,5 +362,42 @@ bb71: ; preds = %bb80, %bb38
ret void
}
-attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+; Check that the resource descriptor is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ ret void
+}
+
+; Check that the sampler is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+ %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
+!2 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/shared-op-cycle.ll b/test/CodeGen/AMDGPU/shared-op-cycle.ll
index f52a9baf4d18..f9a72b47cc99 100644
--- a/test/CodeGen/AMDGPU/shared-op-cycle.ll
+++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll
@@ -4,7 +4,7 @@
; CHECK: MULADD_IEEE *
; CHECK-NOT: MULADD_IEEE *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
%w0 = extractelement <4 x float> %reg0, i32 3
%w1 = extractelement <4 x float> %reg1, i32 3
%w2 = extractelement <4 x float> %reg2, i32 3
@@ -17,16 +17,15 @@ define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float>
%v0 = insertelement <4 x float> undef, float %r0, i32 0
%v1 = insertelement <4 x float> %v0, float %r1, i32 1
%v2 = insertelement <4 x float> %v1, float %r2, i32 2
- %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+ %res = call float @llvm.r600.dot4(<4 x float> %v2, <4 x float> %v2)
%vecres = insertelement <4 x float> undef, float %res, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone } \ No newline at end of file
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
new file mode 100644
index 000000000000..a6555a197388
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -0,0 +1,118 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Extract the high bit of the 1st quarter
+; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+
+; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 31
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 2nd quarter
+; GCN-LABEL: {{^}}v_uextract_bit_63_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 63
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 3rd quarter
+; GCN-LABEL: {{^}}v_uextract_bit_95_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 95
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 4th quarter
+; GCN-LABEL: {{^}}v_uextract_bit_127_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 127
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Spans more than 2 dword boundaries
+; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128:
+; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+
+; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}}
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 34
+ %bit = and i128 %srl, 73786976294838206463
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
new file mode 100644
index 000000000000..c5dbfd9589a6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -0,0 +1,386 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
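+; For example, (and (lshr i64 %x, 33), 1) only reads bit 33, which lives in
+; the high dword, so it can be selected as a 32-bit bfe of hi_32(%x) with
+; offset 1 and width 1 plus a zero high result dword, instead of a full
+; 64-bit shift.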
+
+; Extract the high bit of the low half
+; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the high half
+; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 63
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 32
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 2147483647
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; Spans the dword boundary, so requires full shift
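+; Bits 31 and 32 straddle the two halves, so a full 64-bit shift by 31 is
+; still needed before masking the low result dword with 3.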
+; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 30
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %and = and i64 %srl, 4294967295
+ store i64 %and, i64 addrspace(1)* %out
+ ret void
+}
+
+; trunc applied before the and mask
+; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN: buffer_store_dword v[[SHIFT]]
+define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
+; GCN: buffer_store_dword [[BFE]]
+define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 3
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
+; GCN: buffer_store_dword [[BFE]]
+define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
+; GCN-NOT: v[[SHRLO]]
+; GCN: buffer_store_dword v[[SHRLO]]
+define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 3
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_not_mask_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
+; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
+; GCN-NOT: v[[SHRLO]]
+; GCN-NOT: v[[SHRHI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 4
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; The instruction count is the same with/without hasOneUse, but
+; keeping the 32-bit and has a smaller encoding size than the bfe.
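+; (v_and_b32_e32 with an inline-constant mask is a 4-byte VOP2 encoding,
+; while v_bfe_u32 is an 8-byte VOP3 encoding.)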
+
+; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 27
+ %bit = and i64 %srl, 3
+ store volatile i64 %srl, i64 addrspace(1)* %out
+ store volatile i64 %bit, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 34
+ %bit = and i64 %srl, 7
+ store volatile i64 %srl, i64 addrspace(1)* %out
+ store volatile i64 %bit, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dword v[[ZERO]]
+define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
+ %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 7
+ store volatile i64 %bit, i64 addrspace(1)* %out0.gep
+
+ %srl.srl32 = lshr i64 %srl, 32
+ %srl.hi = trunc i64 %srl.srl32 to i32
+ store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
new file mode 100644
index 000000000000..28a7b924904d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+
+
+; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
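+; e.g. (lshr i64 %x, 35) only needs hi_32(%x): the low result dword is
+; (lshr i32 hi_32(%x), 3) and the high result dword is 0.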
+; GCN-LABEL: {{^}}lshr_i64_35:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 35
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_63:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_33:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 33
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_32:
+; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; Make sure the and of the constant doesn't prevent bfe from forming
+; after 64-bit shift is split.
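+; Here the 0x7fffffffffffffff mask only clears bit 63, so the shift by 40
+; leaves bits 40..62 of the input, i.e. bits 8..30 of the high dword,
+; which is a single 32-bit bfe of the high dword at offset 8, width 23.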
+
+; GCN-LABEL: {{^}}lshr_and_i64_35:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
+ %shl = lshr i64 %and, 40
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; lshl (i64 x), c: c > 32 => reg_sequence lshl 0, (i32 lo_32(x)), (c - 32)
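+; e.g. (shl i64 %x, 35) only needs lo_32(%x): the low result dword is 0 and
+; the high result dword is (shl i32 lo_32(%x), 3).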
+
+; GCN-LABEL: {{^}}shl_i64_const_35:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 35
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_32:
+; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_63:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
+
+; GCN-LABEL: {{^}}ashr_i64_const_32:
+define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = ashr i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_i64_const_63:
+define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = ashr i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_31_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 31
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_15_i16_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
+; GCN: buffer_store_short [[SHL]]
+define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 15
+ %trunc = trunc i64 %shl to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_15_i16_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
+; GCN: buffer_store_short [[SHL]]
+define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %val = load i32, i32 addrspace(1)* %in
+ %shl = shl i32 %val, 15
+ %trunc = trunc i32 %shl to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_7_i8_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
+; GCN: buffer_store_byte [[SHL]]
+define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 7
+ %trunc = trunc i64 %shl to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_1_i2_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
+; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
+; GCN: buffer_store_byte [[AND]]
+define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 1
+ %trunc = trunc i64 %shl to i2
+ store i2 %trunc, i2 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_1_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 1
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_16_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 16
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_33_i32_i64:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[ZERO]]
+define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 33
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_16_v2i32_v2i64:
+; GCN: buffer_load_dwordx4 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %shl = shl <2 x i64> %val, <i64 16, i64 16>
+ %trunc = trunc <2 x i64> %shl to <2 x i32>
+ store <2 x i32> %trunc, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_31_i32_i64_multi_use:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN: buffer_store_dword v[[RESLO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 31
+ %trunc = trunc i64 %shl to i32
+ store volatile i32 %trunc, i32 addrspace(1)* %out
+ store volatile i64 %shl, i64 addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 55db80731c90..5a2b03bff990 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
declare i32 @llvm.r600.read.tidig.x() #0
@@ -208,4 +208,173 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
ret void
}
+; FUNC-LABEL: {{^}}s_shl_constant_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %shl = shl i64 281474976710655, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_constant_i64:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0xab19b207
+; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}}
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
+; SI: buffer_store_dwordx2
+define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 1231231234567, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_i64_32_bit_constant:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}}
+; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}}
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
+define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 1234567, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64:
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}}
+define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 64, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}}
+define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 64, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}}
+define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 1, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4607182418800017408, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13830554455654793216, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}}
+define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4602678819172646912, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13826050856027422720, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4611686018427387904, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13835058055282163712, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4616189618054758400, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13839561654909534208, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+
+; Test with a 64-bit integer whose low 32 bits are the bitpattern of a
+; 32-bit float, which is not a valid 64-bit inline immediate.
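+; e.g. i64 1082130432 is 0x0000000040800000: the low dword is the f32
+; bitpattern of 4.0, but the value is not a 64-bit inline immediate, so it
+; is materialized as a 4.0 low half and a 0 high half.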
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 1082130432, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: Copy of -1 register
+; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_neg_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 -1065353216, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Shift into upper 32-bits
+; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4647714815446351872, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_neg_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13871086852301127680, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
index dfb2bf3383fc..13254d0bcf74 100644
--- a/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Test with inline immediate
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%add = add i32 %val, 9
@@ -26,7 +26,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-DAG: buffer_store_dword [[SHLREG]]
; SI: s_endpgm
define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%add = add i32 %val, 9
@@ -44,7 +44,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%shl = add i32 %val, 999
@@ -57,8 +57,8 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0
; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
-; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_addk_i32 [[RESULT]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
@@ -73,7 +73,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #
; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]]
; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
index ac94824bd61f..a6be2eda33b3 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -7,7 +7,7 @@
; LDS globals.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [512 x float] undef, align 4
@@ -20,7 +20,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
; SI: s_endpgm
define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -40,7 +40,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI-DAG: buffer_store_dword [[ADDUSE]]
; SI: s_endpgm
define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -56,7 +56,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
; SI: s_endpgm
define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 65535
%arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
%val0 = load i8, i8 addrspace(3)* %arrayidx0
@@ -74,7 +74,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; SI: s_endpgm
define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -90,7 +90,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
store float 1.0, float addrspace(3)* %arrayidx0, align 4
@@ -105,7 +105,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
@lds2 = addrspace(3) global [512 x i32] undef, align 4
; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
-; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
@@ -120,7 +120,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
@@ -135,7 +135,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace
; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -149,7 +149,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -163,7 +163,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -177,7 +177,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -191,7 +191,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -205,7 +205,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -215,7 +215,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
}
; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
-; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -229,7 +229,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -243,7 +243,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -257,7 +257,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -271,7 +271,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
deleted file mode 100644
index 69d719385acd..000000000000
--- a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-
-
-define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: {{^}}test:
-
-entry:
- switch i32 %x, label %sw.default [
- i32 0, label %sw.bb
- i32 60, label %sw.bb
- ]
-
-sw.bb:
- unreachable
-
-sw.default:
- unreachable
-
-sw.epilog:
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index bbcb861f37dc..133fd480e599 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -10,9 +10,10 @@
; SI: s_andn2_b64
; s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
main_body:
- %0 = and i32 %a, %b
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %0 = and i32 %a, %tid
%1 = trunc i32 %0 to i1
br label %ENDIF
@@ -39,9 +40,10 @@ ENDIF:
; SI: s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @phi_cond_outside_loop(i32 %a, i32 %b) {
+define void @phi_cond_outside_loop(i32 %b) {
entry:
- %0 = icmp eq i32 %a , 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %0 = icmp eq i32 %tid , 0
br i1 %0, label %if, label %else
if:
@@ -61,3 +63,88 @@ loop:
exit:
ret void
}
+
+; FIXME: should emit s_endpgm
+; CHECK-LABEL: {{^}}switch_unreachable:
+; CHECK-NOT: s_endpgm
+; CHECK: .Lfunc_end2
+define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+centry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 60, label %sw.bb
+ ]
+
+sw.bb:
+ unreachable
+
+sw.default:
+ unreachable
+
+sw.epilog:
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; This broke the old AMDIL cfg structurizer
+; FUNC-LABEL: {{^}}loop_land_info_assert:
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]]
+
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+
+; SI: [[ENDPGM]]:
+; SI: s_endpgm
+define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+entry:
+ %cmp = icmp sgt i32 %c0, 0
+ br label %while.cond.outer
+
+while.cond.outer:
+ %tmp = load float, float addrspace(1)* undef
+ br label %while.cond
+
+while.cond:
+ %cmp1 = icmp slt i32 %c1, 4
+ br i1 %cmp1, label %convex.exit, label %for.cond
+
+convex.exit:
+ %or = or i1 %cmp, %cmp1
+ br i1 %or, label %return, label %if.end
+
+if.end:
+ %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
+ %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
+ br i1 %cmp2, label %if.else, label %while.cond.outer
+
+if.else:
+ store volatile i32 3, i32 addrspace(1)* undef, align 4
+ br label %while.cond
+
+for.cond:
+ %cmp3 = icmp slt i32 %c3, 1000
+ br i1 %cmp3, label %for.body, label %return
+
+for.body:
+ br i1 %cmp3, label %self.loop, label %if.end.2
+
+if.end.2:
+ %or.cond2 = or i1 %cmp3, %arg
+ br i1 %or.cond2, label %return, label %for.cond
+
+self.loop:
+ br label %self.loop
+
+return:
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
new file mode 100644
index 000000000000..025a3d8fca2e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: s_and_saveexec_b64
+; CHECK: s_xor_b64
+; CHECK: s_or_b64 exec, exec
+; CHECK: s_andn2_b64 exec, exec
+; CHECK: s_cbranch_execnz
+define void @test(i32 %arg, i32 %arg1) {
+bb:
+ %tmp = icmp ne i32 %arg, 0
+ %tmp7 = icmp ne i32 %arg1, 0
+ %tmp8 = and i1 %tmp, %tmp7
+ br i1 %tmp8, label %bb9, label %bb11
+
+bb9: ; preds = %bb
+ br label %bb10
+
+bb10: ; preds = %bb10, %bb9
+ br label %bb10
+
+bb11: ; preds = %bb
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 27a8e70aae13..98d1bb7cf9a2 100644
--- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -1,15 +1,15 @@
-; RUN: llc -o /dev/null %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos 2>&1 | FileCheck %s
+; RUN: llc -o - %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos | FileCheck %s
; This test verifies that the instruction selection will add the implicit
; register operands in the correct order when modifying the opcode of an
; instruction to V_ADD_I32_e32.
-; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
+; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 killed %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec
define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
- %a = load i32, i32 addrspace(1)* %in
- %b = load i32, i32 addrspace(1)* %b_ptr
+ %a = load volatile i32, i32 addrspace(1)* %in
+ %b = load volatile i32, i32 addrspace(1)* %b_ptr
%result = add i32 %a, %b
store i32 %result, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll
index 901b3c3453fc..d5030adc89be 100644
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ b/test/CodeGen/AMDGPU/si-literal-folding.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8
-define void @main(float) #0 {
+define amdgpu_vs void @main(float) {
main_body:
%1 = fmul float %0, 0x3FE86A7F00000000
%2 = fmul float %0, 0xBFE86A7F00000000
@@ -13,5 +13,3 @@ main_body:
}
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
index 944499a11461..8df0a64a2b7c 100644
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -5,32 +5,32 @@
; the wrong register class is used for the REG_SEQUENCE instructions.
; CHECK: {{^}}main:
-; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1
- %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
- %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1
- %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
- %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
- %29 = bitcast float %22 to i32
- %30 = bitcast float %27 to i32
- %31 = bitcast float %28 to i32
- %32 = insertelement <4 x i32> undef, i32 %29, i32 0
- %33 = insertelement <4 x i32> %32, i32 %30, i32 1
- %34 = insertelement <4 x i32> %33, i32 %31, i32 2
- %35 = insertelement <4 x i32> %34, i32 undef, i32 3
- %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
- %37 = extractelement <4 x float> %36, i32 0
- %38 = extractelement <4 x float> %36, i32 1
- %39 = extractelement <4 x float> %36, i32 2
- %40 = extractelement <4 x float> %36, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
+ %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0
+ %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
+ %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0
+ %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp28 = bitcast float %tmp21 to i32
+ %tmp29 = bitcast float %tmp26 to i32
+ %tmp30 = bitcast float %tmp27 to i32
+ %tmp31 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
+ %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1
+ %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2
+ %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3
+ %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32>
+ %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp36 = extractelement <4 x float> %tmp35, i32 0
+ %tmp37 = extractelement <4 x float> %tmp35, i32 1
+ %tmp38 = extractelement <4 x float> %tmp35, i32 2
+ %tmp39 = extractelement <4 x float> %tmp35, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39)
ret void
}
@@ -40,13 +40,13 @@ declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
+
attributes #1 = { nounwind readnone }
-!0 = !{!"const", null}
-!1 = !{!0, !0, i64 0, i32 1}
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
new file mode 100644
index 000000000000..ea506e6b3b3f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
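+; SI control-flow lowering must handle blocks that end in unreachable instead
+; of a branch; the checks below make sure the ds_write for such a block is
+; still emitted and the exec mask is restored (purpose inferred from the test
+; name).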
+; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
+; GCN: v_cmp_eq_i32
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+; GCN: s_branch BB0_1
+
+; GCN: s_or_b64 exec, exec
+; GCN: s_endpgm
+
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define void @lower_control_flow_unreachable_terminator() #0 {
+bb:
+ %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp63 = icmp eq i32 %tmp15, 32
+ br i1 %tmp63, label %bb64, label %bb68
+
+bb64:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+
+bb68:
+ ret void
+}
+
+; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
+; GCN: v_cmp_eq_i32
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+; GCN: s_endpgm
+
+; GCN: s_or_b64 exec, exec
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
+bb:
+ %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp63 = icmp eq i32 %tmp15, 32
+ br i1 %tmp63, label %bb68, label %bb64
+
+bb68:
+ ret void
+
+bb64:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
index 66a9571d75bf..5520fe61d867 100644
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -1,4 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
+; FIXME: The si scheduler crashes when lane mask tracking is enabled, so we
+; need to disable lane mask tracking when the si scheduler is being used.
+; The only way the subtarget knows that the si machine scheduler is being used
+; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
+; won't know what scheduler we are using.
+; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
; The test checks that the "si" machine scheduler pass works correctly.
@@ -11,45 +16,45 @@
; CHECK: s_waitcnt vmcnt(0)
; CHECK: exp
; CHECK: s_endpgm
-
-define void @main([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>,
-<2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
main_body:
- %22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
- %23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
- %24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
- %25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
- %26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
- %27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
- %28 = bitcast float %26 to i32
- %29 = bitcast float %27 to i32
- %30 = insertelement <2 x i32> undef, i32 %28, i32 0
- %31 = insertelement <2 x i32> %30, i32 %29, i32 1
- %32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
- %33 = extractelement <4 x float> %32, i32 0
- %34 = extractelement <4 x float> %32, i32 1
- %35 = extractelement <4 x float> %32, i32 2
- %36 = extractelement <4 x float> %32, i32 3
- %37 = call i32 @llvm.SI.packf16(float %33, float %34)
- %38 = bitcast i32 %37 to float
- %39 = call i32 @llvm.SI.packf16(float %35, float %36)
- %40 = bitcast i32 %39 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %38, float %40, float %38, float %40)
+ %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
+ %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
+ %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)*
+ %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0
+ %tmp25 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg11)
+ %tmp26 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg11)
+ %tmp27 = bitcast float %tmp25 to i32
+ %tmp28 = bitcast float %tmp26 to i32
+ %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0
+ %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1
+ %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>
+ %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32>
+ %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp32 = extractelement <4 x float> %tmp31, i32 0
+ %tmp33 = extractelement <4 x float> %tmp31, i32 1
+ %tmp34 = extractelement <4 x float> %tmp31, i32 2
+ %tmp35 = extractelement <4 x float> %tmp31, i32 3
+ %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
+ %tmp37 = bitcast i32 %tmp36 to float
+ %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
+ %tmp39 = bitcast i32 %tmp38 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
-!0 = !{!"const", null, i32 1}
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index d7b35fc631eb..63b1b71a8cb7 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,9 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
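+; The pixel shader below loads many resource descriptors, buffer constants and
+; interpolants, which creates enough SGPR pressure to force spilling.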
+
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
; CHECK-LABEL: {{^}}main:
; CHECK: s_wqm
@@ -19,1560 +22,1601 @@
; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0
; CHECK: s_endpgm
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
-define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
- %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
- %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
- %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112)
- %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116)
- %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120)
- %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
- %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
- %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140)
- %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
- %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
- %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
- %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
- %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
- %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
- %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
- %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
- %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
- %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
- %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
- %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224)
- %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
- %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
- %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
- %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
- %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
- %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
- %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
- %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
- %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
- %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296)
- %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304)
- %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308)
- %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312)
- %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368)
- %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
- %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
- %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
- %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0
- %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0
- %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
- %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0
- %67 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
- %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0
- %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
- %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0
- %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
- %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0
- %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
- %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0
- %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
- %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0
- %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
- %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0
- %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
- %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0
- %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
- %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0
- %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
- %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0
- %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
- %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0
- %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
- %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0
- %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
- %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0
- %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
- %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0
- %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
- %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
- %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
- %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
- %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
- %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
- %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
- %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
- %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
- %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
- %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
- %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
- %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
- %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
- %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
- %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
- %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
- %110 = call i32 @llvm.SI.tid()
- %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
- %112 = bitcast float %93 to i32
- store i32 %112, i32 addrspace(3)* %111
- %113 = bitcast float %94 to i32
- store i32 %113, i32 addrspace(3)* %111
- %114 = call i32 @llvm.SI.tid()
- %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
- %116 = and i32 %114, -4
- %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
- %118 = add i32 %116, 1
- %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
- %120 = bitcast float %93 to i32
- store i32 %120, i32 addrspace(3)* %115
- %121 = load i32, i32 addrspace(3)* %117
- %122 = bitcast i32 %121 to float
- %123 = load i32, i32 addrspace(3)* %119
- %124 = bitcast i32 %123 to float
- %125 = fsub float %124, %122
- %126 = bitcast float %94 to i32
- store i32 %126, i32 addrspace(3)* %115
- %127 = load i32, i32 addrspace(3)* %117
- %128 = bitcast i32 %127 to float
- %129 = load i32, i32 addrspace(3)* %119
- %130 = bitcast i32 %129 to float
- %131 = fsub float %130, %128
- %132 = insertelement <4 x float> undef, float %125, i32 0
- %133 = insertelement <4 x float> %132, float %131, i32 1
- %134 = insertelement <4 x float> %133, float %131, i32 2
- %135 = insertelement <4 x float> %134, float %131, i32 3
- %136 = extractelement <4 x float> %135, i32 0
- %137 = extractelement <4 x float> %135, i32 1
- %138 = fmul float %60, %93
- %139 = fmul float %60, %94
- %140 = fmul float %60, %94
- %141 = fmul float %60, %94
- %142 = call i32 @llvm.SI.tid()
- %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
- %144 = bitcast float %138 to i32
- store i32 %144, i32 addrspace(3)* %143
- %145 = bitcast float %139 to i32
- store i32 %145, i32 addrspace(3)* %143
- %146 = bitcast float %140 to i32
- store i32 %146, i32 addrspace(3)* %143
- %147 = bitcast float %141 to i32
- store i32 %147, i32 addrspace(3)* %143
- %148 = call i32 @llvm.SI.tid()
- %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
- %150 = and i32 %148, -4
- %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
- %152 = add i32 %150, 2
- %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
- %154 = bitcast float %138 to i32
- store i32 %154, i32 addrspace(3)* %149
- %155 = load i32, i32 addrspace(3)* %151
- %156 = bitcast i32 %155 to float
- %157 = load i32, i32 addrspace(3)* %153
- %158 = bitcast i32 %157 to float
- %159 = fsub float %158, %156
- %160 = bitcast float %139 to i32
- store i32 %160, i32 addrspace(3)* %149
- %161 = load i32, i32 addrspace(3)* %151
- %162 = bitcast i32 %161 to float
- %163 = load i32, i32 addrspace(3)* %153
- %164 = bitcast i32 %163 to float
- %165 = fsub float %164, %162
- %166 = bitcast float %140 to i32
- store i32 %166, i32 addrspace(3)* %149
- %167 = load i32, i32 addrspace(3)* %151
- %168 = bitcast i32 %167 to float
- %169 = load i32, i32 addrspace(3)* %153
- %170 = bitcast i32 %169 to float
- %171 = fsub float %170, %168
- %172 = bitcast float %141 to i32
- store i32 %172, i32 addrspace(3)* %149
- %173 = load i32, i32 addrspace(3)* %151
- %174 = bitcast i32 %173 to float
- %175 = load i32, i32 addrspace(3)* %153
- %176 = bitcast i32 %175 to float
- %177 = fsub float %176, %174
- %178 = insertelement <4 x float> undef, float %159, i32 0
- %179 = insertelement <4 x float> %178, float %165, i32 1
- %180 = insertelement <4 x float> %179, float %171, i32 2
- %181 = insertelement <4 x float> %180, float %177, i32 3
- %182 = extractelement <4 x float> %181, i32 0
- %183 = extractelement <4 x float> %181, i32 1
- %184 = fdiv float 1.000000e+00, %97
- %185 = fmul float %33, %184
- %186 = fcmp uge float 1.000000e+00, %185
- %187 = select i1 %186, float %185, float 1.000000e+00
- %188 = fmul float %187, %30
- %189 = call float @ceil(float %188)
- %190 = fcmp uge float 3.000000e+00, %189
- %191 = select i1 %190, float 3.000000e+00, float %189
- %192 = fdiv float 1.000000e+00, %191
- %193 = fdiv float 1.000000e+00, %30
- %194 = fmul float %191, %193
- %195 = fmul float %31, %194
- %196 = fmul float %95, %95
- %197 = fmul float %96, %96
- %198 = fadd float %197, %196
- %199 = fmul float %97, %97
- %200 = fadd float %198, %199
- %201 = call float @llvm.AMDGPU.rsq.f32(float %200)
- %202 = fmul float %95, %201
- %203 = fmul float %96, %201
- %204 = fmul float %202, %29
- %205 = fmul float %203, %29
- %206 = fmul float %204, -1.000000e+00
- %207 = fmul float %205, 1.000000e+00
- %208 = fmul float %206, %32
- %209 = fmul float %207, %32
- %210 = fsub float -0.000000e+00, %208
- %211 = fadd float %93, %210
- %212 = fsub float -0.000000e+00, %209
- %213 = fadd float %94, %212
- %214 = fmul float %206, %192
- %215 = fmul float %207, %192
- %216 = fmul float -1.000000e+00, %192
- %217 = bitcast float %136 to i32
- %218 = bitcast float %182 to i32
- %219 = bitcast float %137 to i32
- %220 = bitcast float %183 to i32
- %221 = insertelement <8 x i32> undef, i32 %217, i32 0
- %222 = insertelement <8 x i32> %221, i32 %218, i32 1
- %223 = insertelement <8 x i32> %222, i32 %219, i32 2
- %224 = insertelement <8 x i32> %223, i32 %220, i32 3
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 96)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 100)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 104)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 112)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 116)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 120)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 140)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184)
+ %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192)
+ %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196)
+ %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200)
+ %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208)
+ %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212)
+ %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216)
+ %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 224)
+ %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240)
+ %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244)
+ %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248)
+ %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256)
+ %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272)
+ %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276)
+ %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280)
+ %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288)
+ %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292)
+ %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 296)
+ %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 304)
+ %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 308)
+ %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 312)
+ %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 368)
+ %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 372)
+ %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 376)
+ %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 384)
+ %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0
+ %tmp62 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp63 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp62, !tbaa !0
+ %tmp63.bc = bitcast <16 x i8> %tmp63 to <4 x i32>
+ %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
+ %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0
+ %tmp66 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1
+ %tmp67 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp66, !tbaa !0
+ %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
+ %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0
+ %tmp70 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2
+ %tmp71 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp70, !tbaa !0
+ %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
+ %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0
+ %tmp74 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3
+ %tmp75 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp74, !tbaa !0
+ %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
+ %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0
+ %tmp78 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4
+ %tmp79 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp78, !tbaa !0
+ %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
+ %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0
+ %tmp82 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5
+ %tmp83 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp82, !tbaa !0
+ %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
+ %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0
+ %tmp86 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6
+ %tmp87 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp86, !tbaa !0
+ %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
+ %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0
+ %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
+ %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0
+ %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp93 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp94 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp95 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp96 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp97 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp98 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp99 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp100 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp101 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp102 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp103 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp104 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp105 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp106 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp107 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp108 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
+ %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+ %tmp111 = bitcast float %tmp92 to i32
+ store i32 %tmp111, i32 addrspace(3)* %tmp110
+ %tmp112 = bitcast float %tmp93 to i32
+ store i32 %tmp112, i32 addrspace(3)* %tmp110
+ %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
+ %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+ %tmp115 = and i32 %tmp113, -4
+ %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+ %tmp117 = add i32 %tmp115, 1
+ %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+ %tmp119 = bitcast float %tmp92 to i32
+ store i32 %tmp119, i32 addrspace(3)* %tmp114
+ %tmp120 = load i32, i32 addrspace(3)* %tmp116
+ %tmp121 = bitcast i32 %tmp120 to float
+ %tmp122 = load i32, i32 addrspace(3)* %tmp118
+ %tmp123 = bitcast i32 %tmp122 to float
+ %tmp124 = fsub float %tmp123, %tmp121
+ %tmp125 = bitcast float %tmp93 to i32
+ store i32 %tmp125, i32 addrspace(3)* %tmp114
+ %tmp126 = load i32, i32 addrspace(3)* %tmp116
+ %tmp127 = bitcast i32 %tmp126 to float
+ %tmp128 = load i32, i32 addrspace(3)* %tmp118
+ %tmp129 = bitcast i32 %tmp128 to float
+ %tmp130 = fsub float %tmp129, %tmp127
+ %tmp131 = insertelement <4 x float> undef, float %tmp124, i32 0
+ %tmp132 = insertelement <4 x float> %tmp131, float %tmp130, i32 1
+ %tmp133 = insertelement <4 x float> %tmp132, float %tmp130, i32 2
+ %tmp134 = insertelement <4 x float> %tmp133, float %tmp130, i32 3
+ %tmp135 = extractelement <4 x float> %tmp134, i32 0
+ %tmp136 = extractelement <4 x float> %tmp134, i32 1
+ %tmp137 = fmul float %tmp59, %tmp92
+ %tmp138 = fmul float %tmp59, %tmp93
+ %tmp139 = fmul float %tmp59, %tmp93
+ %tmp140 = fmul float %tmp59, %tmp93
+ %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
+ %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+ %tmp143 = bitcast float %tmp137 to i32
+ store i32 %tmp143, i32 addrspace(3)* %tmp142
+ %tmp144 = bitcast float %tmp138 to i32
+ store i32 %tmp144, i32 addrspace(3)* %tmp142
+ %tmp145 = bitcast float %tmp139 to i32
+ store i32 %tmp145, i32 addrspace(3)* %tmp142
+ %tmp146 = bitcast float %tmp140 to i32
+ store i32 %tmp146, i32 addrspace(3)* %tmp142
+ %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
+ %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+ %tmp149 = and i32 %tmp147, -4
+ %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+ %tmp151 = add i32 %tmp149, 2
+ %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+ %tmp153 = bitcast float %tmp137 to i32
+ store i32 %tmp153, i32 addrspace(3)* %tmp148
+ %tmp154 = load i32, i32 addrspace(3)* %tmp150
+ %tmp155 = bitcast i32 %tmp154 to float
+ %tmp156 = load i32, i32 addrspace(3)* %tmp152
+ %tmp157 = bitcast i32 %tmp156 to float
+ %tmp158 = fsub float %tmp157, %tmp155
+ %tmp159 = bitcast float %tmp138 to i32
+ store i32 %tmp159, i32 addrspace(3)* %tmp148
+ %tmp160 = load i32, i32 addrspace(3)* %tmp150
+ %tmp161 = bitcast i32 %tmp160 to float
+ %tmp162 = load i32, i32 addrspace(3)* %tmp152
+ %tmp163 = bitcast i32 %tmp162 to float
+ %tmp164 = fsub float %tmp163, %tmp161
+ %tmp165 = bitcast float %tmp139 to i32
+ store i32 %tmp165, i32 addrspace(3)* %tmp148
+ %tmp166 = load i32, i32 addrspace(3)* %tmp150
+ %tmp167 = bitcast i32 %tmp166 to float
+ %tmp168 = load i32, i32 addrspace(3)* %tmp152
+ %tmp169 = bitcast i32 %tmp168 to float
+ %tmp170 = fsub float %tmp169, %tmp167
+ %tmp171 = bitcast float %tmp140 to i32
+ store i32 %tmp171, i32 addrspace(3)* %tmp148
+ %tmp172 = load i32, i32 addrspace(3)* %tmp150
+ %tmp173 = bitcast i32 %tmp172 to float
+ %tmp174 = load i32, i32 addrspace(3)* %tmp152
+ %tmp175 = bitcast i32 %tmp174 to float
+ %tmp176 = fsub float %tmp175, %tmp173
+ %tmp177 = insertelement <4 x float> undef, float %tmp158, i32 0
+ %tmp178 = insertelement <4 x float> %tmp177, float %tmp164, i32 1
+ %tmp179 = insertelement <4 x float> %tmp178, float %tmp170, i32 2
+ %tmp180 = insertelement <4 x float> %tmp179, float %tmp176, i32 3
+ %tmp181 = extractelement <4 x float> %tmp180, i32 0
+ %tmp182 = extractelement <4 x float> %tmp180, i32 1
+ %tmp183 = fdiv float 1.000000e+00, %tmp96
+ %tmp184 = fmul float %tmp32, %tmp183
+ %tmp185 = fcmp uge float 1.000000e+00, %tmp184
+ %tmp186 = select i1 %tmp185, float %tmp184, float 1.000000e+00
+ %tmp187 = fmul float %tmp186, %tmp29
+ %tmp188 = call float @ceil(float %tmp187)
+ %tmp189 = fcmp uge float 3.000000e+00, %tmp188
+ %tmp190 = select i1 %tmp189, float 3.000000e+00, float %tmp188
+ %tmp191 = fdiv float 1.000000e+00, %tmp190
+ %tmp192 = fdiv float 1.000000e+00, %tmp29
+ %tmp193 = fmul float %tmp190, %tmp192
+ %tmp194 = fmul float %tmp30, %tmp193
+ %tmp195 = fmul float %tmp94, %tmp94
+ %tmp196 = fmul float %tmp95, %tmp95
+ %tmp197 = fadd float %tmp196, %tmp195
+ %tmp198 = fmul float %tmp96, %tmp96
+ %tmp199 = fadd float %tmp197, %tmp198
+ %tmp200 = call float @llvm.amdgcn.rsq.f32(float %tmp199)
+ %tmp201 = fmul float %tmp94, %tmp200
+ %tmp202 = fmul float %tmp95, %tmp200
+ %tmp203 = fmul float %tmp201, %tmp28
+ %tmp204 = fmul float %tmp202, %tmp28
+ %tmp205 = fmul float %tmp203, -1.000000e+00
+ %tmp206 = fmul float %tmp204, 1.000000e+00
+ %tmp207 = fmul float %tmp205, %tmp31
+ %tmp208 = fmul float %tmp206, %tmp31
+ %tmp209 = fsub float -0.000000e+00, %tmp207
+ %tmp210 = fadd float %tmp92, %tmp209
+ %tmp211 = fsub float -0.000000e+00, %tmp208
+ %tmp212 = fadd float %tmp93, %tmp211
+ %tmp213 = fmul float %tmp205, %tmp191
+ %tmp214 = fmul float %tmp206, %tmp191
+ %tmp215 = fmul float -1.000000e+00, %tmp191
+ %tmp216 = bitcast float %tmp135 to i32
+ %tmp217 = bitcast float %tmp181 to i32
+ %tmp218 = bitcast float %tmp136 to i32
+ %tmp219 = bitcast float %tmp182 to i32
+ %tmp220 = insertelement <8 x i32> undef, i32 %tmp216, i32 0
+ %tmp221 = insertelement <8 x i32> %tmp220, i32 %tmp217, i32 1
+ %tmp222 = insertelement <8 x i32> %tmp221, i32 %tmp218, i32 2
+ %tmp223 = insertelement <8 x i32> %tmp222, i32 %tmp219, i32 3
br label %LOOP
LOOP: ; preds = %ENDIF, %main_body
- %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ]
- %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ]
- %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ]
- %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ]
- %225 = fcmp oge float %temp24.0, %191
- %226 = sext i1 %225 to i32
- %227 = bitcast i32 %226 to float
- %228 = bitcast float %227 to i32
- %229 = icmp ne i32 %228, 0
- br i1 %229, label %IF, label %ENDIF
+ %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp257, %ENDIF ]
+ %temp28.0 = phi float [ %tmp210, %main_body ], [ %tmp252, %ENDIF ]
+ %temp29.0 = phi float [ %tmp212, %main_body ], [ %tmp254, %ENDIF ]
+ %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp256, %ENDIF ]
+ %tmp224 = fcmp oge float %temp24.0, %tmp190
+ %tmp225 = sext i1 %tmp224 to i32
+ %tmp226 = bitcast i32 %tmp225 to float
+ %tmp227 = bitcast float %tmp226 to i32
+ %tmp228 = icmp ne i32 %tmp227, 0
+ br i1 %tmp228, label %IF, label %ENDIF
IF: ; preds = %LOOP
- %230 = bitcast float %136 to i32
- %231 = bitcast float %182 to i32
- %232 = bitcast float %137 to i32
- %233 = bitcast float %183 to i32
- %234 = insertelement <8 x i32> undef, i32 %230, i32 0
- %235 = insertelement <8 x i32> %234, i32 %231, i32 1
- %236 = insertelement <8 x i32> %235, i32 %232, i32 2
- %237 = insertelement <8 x i32> %236, i32 %233, i32 3
+ %tmp229 = bitcast float %tmp135 to i32
+ %tmp230 = bitcast float %tmp181 to i32
+ %tmp231 = bitcast float %tmp136 to i32
+ %tmp232 = bitcast float %tmp182 to i32
+ %tmp233 = insertelement <8 x i32> undef, i32 %tmp229, i32 0
+ %tmp234 = insertelement <8 x i32> %tmp233, i32 %tmp230, i32 1
+ %tmp235 = insertelement <8 x i32> %tmp234, i32 %tmp231, i32 2
+ %tmp236 = insertelement <8 x i32> %tmp235, i32 %tmp232, i32 3
br label %LOOP65
ENDIF: ; preds = %LOOP
- %238 = bitcast float %temp28.0 to i32
- %239 = bitcast float %temp29.0 to i32
- %240 = insertelement <8 x i32> %224, i32 %238, i32 4
- %241 = insertelement <8 x i32> %240, i32 %239, i32 5
- %242 = insertelement <8 x i32> %241, i32 undef, i32 6
- %243 = insertelement <8 x i32> %242, i32 undef, i32 7
- %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2)
- %245 = extractelement <4 x float> %244, i32 3
- %246 = fcmp oge float %temp30.0, %245
- %247 = sext i1 %246 to i32
- %248 = bitcast i32 %247 to float
- %249 = bitcast float %248 to i32
- %250 = and i32 %249, 1065353216
- %251 = bitcast i32 %250 to float
- %252 = fmul float %214, %251
- %253 = fadd float %252, %temp28.0
- %254 = fmul float %215, %251
- %255 = fadd float %254, %temp29.0
- %256 = fmul float %216, %251
- %257 = fadd float %256, %temp30.0
- %258 = fadd float %temp24.0, 1.000000e+00
+ %tmp237 = bitcast float %temp28.0 to i32
+ %tmp238 = bitcast float %temp29.0 to i32
+ %tmp239 = insertelement <8 x i32> %tmp223, i32 %tmp237, i32 4
+ %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5
+ %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6
+ %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7
+ %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp244 = extractelement <4 x float> %tmp243, i32 3
+ %tmp245 = fcmp oge float %temp30.0, %tmp244
+ %tmp246 = sext i1 %tmp245 to i32
+ %tmp247 = bitcast i32 %tmp246 to float
+ %tmp248 = bitcast float %tmp247 to i32
+ %tmp249 = and i32 %tmp248, 1065353216
+ %tmp250 = bitcast i32 %tmp249 to float
+ %tmp251 = fmul float %tmp213, %tmp250
+ %tmp252 = fadd float %tmp251, %temp28.0
+ %tmp253 = fmul float %tmp214, %tmp250
+ %tmp254 = fadd float %tmp253, %temp29.0
+ %tmp255 = fmul float %tmp215, %tmp250
+ %tmp256 = fadd float %tmp255, %temp30.0
+ %tmp257 = fadd float %temp24.0, 1.000000e+00
br label %LOOP
LOOP65: ; preds = %ENDIF66, %IF
- %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ]
- %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ]
- %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ]
- %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ]
- %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ]
- %259 = fcmp oge float %temp24.1, %195
- %260 = sext i1 %259 to i32
- %261 = bitcast i32 %260 to float
- %262 = bitcast float %261 to i32
- %263 = icmp ne i32 %262, 0
- br i1 %263, label %IF67, label %ENDIF66
+ %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %tmp609, %ENDIF66 ]
+ %temp28.1 = phi float [ %temp28.0, %IF ], [ %tmp604, %ENDIF66 ]
+ %temp29.1 = phi float [ %temp29.0, %IF ], [ %tmp606, %ENDIF66 ]
+ %temp30.1 = phi float [ %temp30.0, %IF ], [ %tmp608, %ENDIF66 ]
+ %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %tmp610, %ENDIF66 ]
+ %tmp258 = fcmp oge float %temp24.1, %tmp194
+ %tmp259 = sext i1 %tmp258 to i32
+ %tmp260 = bitcast i32 %tmp259 to float
+ %tmp261 = bitcast float %tmp260 to i32
+ %tmp262 = icmp ne i32 %tmp261, 0
+ br i1 %tmp262, label %IF67, label %ENDIF66
IF67: ; preds = %LOOP65
- %264 = bitcast float %136 to i32
- %265 = bitcast float %182 to i32
- %266 = bitcast float %137 to i32
- %267 = bitcast float %183 to i32
- %268 = bitcast float %temp28.1 to i32
- %269 = bitcast float %temp29.1 to i32
- %270 = insertelement <8 x i32> undef, i32 %264, i32 0
- %271 = insertelement <8 x i32> %270, i32 %265, i32 1
- %272 = insertelement <8 x i32> %271, i32 %266, i32 2
- %273 = insertelement <8 x i32> %272, i32 %267, i32 3
- %274 = insertelement <8 x i32> %273, i32 %268, i32 4
- %275 = insertelement <8 x i32> %274, i32 %269, i32 5
- %276 = insertelement <8 x i32> %275, i32 undef, i32 6
- %277 = insertelement <8 x i32> %276, i32 undef, i32 7
- %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2)
- %279 = extractelement <4 x float> %278, i32 0
- %280 = extractelement <4 x float> %278, i32 1
- %281 = extractelement <4 x float> %278, i32 2
- %282 = extractelement <4 x float> %278, i32 3
- %283 = fmul float %282, %47
- %284 = bitcast float %136 to i32
- %285 = bitcast float %182 to i32
- %286 = bitcast float %137 to i32
- %287 = bitcast float %183 to i32
- %288 = bitcast float %temp28.1 to i32
- %289 = bitcast float %temp29.1 to i32
- %290 = insertelement <8 x i32> undef, i32 %284, i32 0
- %291 = insertelement <8 x i32> %290, i32 %285, i32 1
- %292 = insertelement <8 x i32> %291, i32 %286, i32 2
- %293 = insertelement <8 x i32> %292, i32 %287, i32 3
- %294 = insertelement <8 x i32> %293, i32 %288, i32 4
- %295 = insertelement <8 x i32> %294, i32 %289, i32 5
- %296 = insertelement <8 x i32> %295, i32 undef, i32 6
- %297 = insertelement <8 x i32> %296, i32 undef, i32 7
- %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2)
- %299 = extractelement <4 x float> %298, i32 0
- %300 = extractelement <4 x float> %298, i32 1
- %301 = extractelement <4 x float> %298, i32 2
- %302 = bitcast float %136 to i32
- %303 = bitcast float %182 to i32
- %304 = bitcast float %137 to i32
- %305 = bitcast float %183 to i32
- %306 = bitcast float %temp28.1 to i32
- %307 = bitcast float %temp29.1 to i32
- %308 = insertelement <8 x i32> undef, i32 %302, i32 0
- %309 = insertelement <8 x i32> %308, i32 %303, i32 1
- %310 = insertelement <8 x i32> %309, i32 %304, i32 2
- %311 = insertelement <8 x i32> %310, i32 %305, i32 3
- %312 = insertelement <8 x i32> %311, i32 %306, i32 4
- %313 = insertelement <8 x i32> %312, i32 %307, i32 5
- %314 = insertelement <8 x i32> %313, i32 undef, i32 6
- %315 = insertelement <8 x i32> %314, i32 undef, i32 7
- %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2)
- %317 = extractelement <4 x float> %316, i32 0
- %318 = extractelement <4 x float> %316, i32 1
- %319 = extractelement <4 x float> %316, i32 2
- %320 = fmul float %317, %23
- %321 = fmul float %318, %24
- %322 = fmul float %319, %25
- %323 = fmul float %299, %26
- %324 = fadd float %323, %320
- %325 = fmul float %300, %27
- %326 = fadd float %325, %321
- %327 = fmul float %301, %28
- %328 = fadd float %327, %322
- %329 = fadd float %279, %324
- %330 = fadd float %280, %326
- %331 = fadd float %281, %328
- %332 = bitcast float %136 to i32
- %333 = bitcast float %182 to i32
- %334 = bitcast float %137 to i32
- %335 = bitcast float %183 to i32
- %336 = bitcast float %temp28.1 to i32
- %337 = bitcast float %temp29.1 to i32
- %338 = insertelement <8 x i32> undef, i32 %332, i32 0
- %339 = insertelement <8 x i32> %338, i32 %333, i32 1
- %340 = insertelement <8 x i32> %339, i32 %334, i32 2
- %341 = insertelement <8 x i32> %340, i32 %335, i32 3
- %342 = insertelement <8 x i32> %341, i32 %336, i32 4
- %343 = insertelement <8 x i32> %342, i32 %337, i32 5
- %344 = insertelement <8 x i32> %343, i32 undef, i32 6
- %345 = insertelement <8 x i32> %344, i32 undef, i32 7
- %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2)
- %347 = extractelement <4 x float> %346, i32 0
- %348 = extractelement <4 x float> %346, i32 1
- %349 = extractelement <4 x float> %346, i32 2
- %350 = fadd float %347, -5.000000e-01
- %351 = fadd float %348, -5.000000e-01
- %352 = fadd float %349, -5.000000e-01
- %353 = fmul float %350, %350
- %354 = fmul float %351, %351
- %355 = fadd float %354, %353
- %356 = fmul float %352, %352
- %357 = fadd float %355, %356
- %358 = call float @llvm.AMDGPU.rsq.f32(float %357)
- %359 = fmul float %350, %358
- %360 = fmul float %351, %358
- %361 = fmul float %352, %358
- %362 = bitcast float %136 to i32
- %363 = bitcast float %182 to i32
- %364 = bitcast float %137 to i32
- %365 = bitcast float %183 to i32
- %366 = bitcast float %temp28.1 to i32
- %367 = bitcast float %temp29.1 to i32
- %368 = insertelement <8 x i32> undef, i32 %362, i32 0
- %369 = insertelement <8 x i32> %368, i32 %363, i32 1
- %370 = insertelement <8 x i32> %369, i32 %364, i32 2
- %371 = insertelement <8 x i32> %370, i32 %365, i32 3
- %372 = insertelement <8 x i32> %371, i32 %366, i32 4
- %373 = insertelement <8 x i32> %372, i32 %367, i32 5
- %374 = insertelement <8 x i32> %373, i32 undef, i32 6
- %375 = insertelement <8 x i32> %374, i32 undef, i32 7
- %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2)
- %377 = extractelement <4 x float> %376, i32 0
- %378 = extractelement <4 x float> %376, i32 1
- %379 = extractelement <4 x float> %376, i32 2
- %380 = extractelement <4 x float> %376, i32 3
- %381 = fsub float -0.000000e+00, %95
- %382 = fsub float -0.000000e+00, %96
- %383 = fsub float -0.000000e+00, %97
- %384 = fmul float %359, %381
- %385 = fmul float %360, %382
- %386 = fadd float %385, %384
- %387 = fmul float %361, %383
- %388 = fadd float %386, %387
- %389 = fmul float %388, %359
- %390 = fmul float %388, %360
- %391 = fmul float %388, %361
- %392 = fmul float 2.000000e+00, %389
- %393 = fmul float 2.000000e+00, %390
- %394 = fmul float 2.000000e+00, %391
- %395 = fsub float -0.000000e+00, %392
- %396 = fadd float %381, %395
- %397 = fsub float -0.000000e+00, %393
- %398 = fadd float %382, %397
- %399 = fsub float -0.000000e+00, %394
- %400 = fadd float %383, %399
- %401 = fmul float %396, %98
- %402 = fmul float %396, %99
- %403 = fmul float %396, %100
- %404 = fmul float %398, %101
- %405 = fadd float %404, %401
- %406 = fmul float %398, %102
- %407 = fadd float %406, %402
- %408 = fmul float %398, %103
- %409 = fadd float %408, %403
- %410 = fmul float %400, %104
- %411 = fadd float %410, %405
- %412 = fmul float %400, %105
- %413 = fadd float %412, %407
- %414 = fmul float %400, %106
- %415 = fadd float %414, %409
- %416 = bitcast float %136 to i32
- %417 = bitcast float %182 to i32
- %418 = bitcast float %137 to i32
- %419 = bitcast float %183 to i32
- %420 = bitcast float %temp28.1 to i32
- %421 = bitcast float %temp29.1 to i32
- %422 = insertelement <8 x i32> undef, i32 %416, i32 0
- %423 = insertelement <8 x i32> %422, i32 %417, i32 1
- %424 = insertelement <8 x i32> %423, i32 %418, i32 2
- %425 = insertelement <8 x i32> %424, i32 %419, i32 3
- %426 = insertelement <8 x i32> %425, i32 %420, i32 4
- %427 = insertelement <8 x i32> %426, i32 %421, i32 5
- %428 = insertelement <8 x i32> %427, i32 undef, i32 6
- %429 = insertelement <8 x i32> %428, i32 undef, i32 7
- %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2)
- %431 = extractelement <4 x float> %430, i32 0
- %432 = extractelement <4 x float> %430, i32 1
- %433 = extractelement <4 x float> %430, i32 2
- %434 = fmul float %48, %411
- %435 = fmul float %49, %411
- %436 = fmul float %50, %411
- %437 = fmul float %51, %413
- %438 = fadd float %437, %434
- %439 = fmul float %52, %413
- %440 = fadd float %439, %435
- %441 = fmul float %53, %413
- %442 = fadd float %441, %436
- %443 = fmul float %54, %415
- %444 = fadd float %443, %438
- %445 = fmul float %55, %415
- %446 = fadd float %445, %440
- %447 = fmul float %56, %415
- %448 = fadd float %447, %442
- %449 = insertelement <4 x float> undef, float %444, i32 0
- %450 = insertelement <4 x float> %449, float %446, i32 1
- %451 = insertelement <4 x float> %450, float %448, i32 2
- %452 = insertelement <4 x float> %451, float %195, i32 3
- %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452)
- %454 = extractelement <4 x float> %453, i32 0
- %455 = extractelement <4 x float> %453, i32 1
- %456 = extractelement <4 x float> %453, i32 2
- %457 = extractelement <4 x float> %453, i32 3
- %458 = call float @fabs(float %456)
- %459 = fdiv float 1.000000e+00, %458
- %460 = fmul float %454, %459
- %461 = fadd float %460, 1.500000e+00
- %462 = fmul float %455, %459
- %463 = fadd float %462, 1.500000e+00
- %464 = bitcast float %463 to i32
- %465 = bitcast float %461 to i32
- %466 = bitcast float %457 to i32
- %467 = insertelement <4 x i32> undef, i32 %464, i32 0
- %468 = insertelement <4 x i32> %467, i32 %465, i32 1
- %469 = insertelement <4 x i32> %468, i32 %466, i32 2
- %470 = insertelement <4 x i32> %469, i32 undef, i32 3
- %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4)
- %472 = extractelement <4 x float> %471, i32 0
- %473 = extractelement <4 x float> %471, i32 1
- %474 = extractelement <4 x float> %471, i32 2
- %475 = fmul float %431, %472
- %476 = fadd float %475, %329
- %477 = fmul float %432, %473
- %478 = fadd float %477, %330
- %479 = fmul float %433, %474
- %480 = fadd float %479, %331
- %481 = fmul float %107, %107
- %482 = fmul float %108, %108
- %483 = fadd float %482, %481
- %484 = fmul float %109, %109
- %485 = fadd float %483, %484
- %486 = call float @llvm.AMDGPU.rsq.f32(float %485)
- %487 = fmul float %107, %486
- %488 = fmul float %108, %486
- %489 = fmul float %109, %486
- %490 = fmul float %377, %40
- %491 = fmul float %378, %41
- %492 = fmul float %379, %42
- %493 = fmul float %359, %487
- %494 = fmul float %360, %488
- %495 = fadd float %494, %493
- %496 = fmul float %361, %489
- %497 = fadd float %495, %496
- %498 = fmul float %497, %359
- %499 = fmul float %497, %360
- %500 = fmul float %497, %361
- %501 = fmul float 2.000000e+00, %498
- %502 = fmul float 2.000000e+00, %499
- %503 = fmul float 2.000000e+00, %500
- %504 = fsub float -0.000000e+00, %501
- %505 = fadd float %487, %504
- %506 = fsub float -0.000000e+00, %502
- %507 = fadd float %488, %506
- %508 = fsub float -0.000000e+00, %503
- %509 = fadd float %489, %508
- %510 = fmul float %95, %95
- %511 = fmul float %96, %96
- %512 = fadd float %511, %510
- %513 = fmul float %97, %97
- %514 = fadd float %512, %513
- %515 = call float @llvm.AMDGPU.rsq.f32(float %514)
- %516 = fmul float %95, %515
- %517 = fmul float %96, %515
- %518 = fmul float %97, %515
- %519 = fmul float %505, %516
- %520 = fmul float %507, %517
- %521 = fadd float %520, %519
- %522 = fmul float %509, %518
- %523 = fadd float %521, %522
- %524 = fsub float -0.000000e+00, %523
- %525 = fcmp uge float %524, 0.000000e+00
- %526 = select i1 %525, float %524, float 0.000000e+00
- %527 = fmul float %43, %380
- %528 = fadd float %527, 1.000000e+00
- %529 = call float @llvm.pow.f32(float %526, float %528)
- %530 = fmul float %476, %37
- %531 = fmul float %478, %38
- %532 = fmul float %480, %39
- %533 = fmul float %359, %487
- %534 = fmul float %360, %488
- %535 = fadd float %534, %533
- %536 = fmul float %361, %489
- %537 = fadd float %535, %536
- %538 = fcmp uge float %537, 0.000000e+00
- %539 = select i1 %538, float %537, float 0.000000e+00
- %540 = fmul float %530, %539
- %541 = fmul float %531, %539
- %542 = fmul float %532, %539
- %543 = fmul float %490, %529
- %544 = fadd float %543, %540
- %545 = fmul float %491, %529
- %546 = fadd float %545, %541
- %547 = fmul float %492, %529
- %548 = fadd float %547, %542
- %549 = fmul float %476, %34
- %550 = fmul float %478, %35
- %551 = fmul float %480, %36
- %552 = fmul float %544, %57
- %553 = fadd float %552, %549
- %554 = fmul float %546, %58
- %555 = fadd float %554, %550
- %556 = fmul float %548, %59
- %557 = fadd float %556, %551
- %558 = bitcast float %136 to i32
- %559 = bitcast float %182 to i32
- %560 = bitcast float %137 to i32
- %561 = bitcast float %183 to i32
- %562 = bitcast float %temp28.1 to i32
- %563 = bitcast float %temp29.1 to i32
- %564 = insertelement <8 x i32> undef, i32 %558, i32 0
- %565 = insertelement <8 x i32> %564, i32 %559, i32 1
- %566 = insertelement <8 x i32> %565, i32 %560, i32 2
- %567 = insertelement <8 x i32> %566, i32 %561, i32 3
- %568 = insertelement <8 x i32> %567, i32 %562, i32 4
- %569 = insertelement <8 x i32> %568, i32 %563, i32 5
- %570 = insertelement <8 x i32> %569, i32 undef, i32 6
- %571 = insertelement <8 x i32> %570, i32 undef, i32 7
- %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2)
- %573 = extractelement <4 x float> %572, i32 0
- %574 = extractelement <4 x float> %572, i32 1
- %575 = extractelement <4 x float> %572, i32 2
- %576 = fmul float %573, %44
- %577 = fadd float %576, %553
- %578 = fmul float %574, %45
- %579 = fadd float %578, %555
- %580 = fmul float %575, %46
- %581 = fadd float %580, %557
- %582 = call i32 @llvm.SI.packf16(float %577, float %579)
- %583 = bitcast i32 %582 to float
- %584 = call i32 @llvm.SI.packf16(float %581, float %283)
- %585 = bitcast i32 %584 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585)
+ %tmp263 = bitcast float %tmp135 to i32
+ %tmp264 = bitcast float %tmp181 to i32
+ %tmp265 = bitcast float %tmp136 to i32
+ %tmp266 = bitcast float %tmp182 to i32
+ %tmp267 = bitcast float %temp28.1 to i32
+ %tmp268 = bitcast float %temp29.1 to i32
+ %tmp269 = insertelement <8 x i32> undef, i32 %tmp263, i32 0
+ %tmp270 = insertelement <8 x i32> %tmp269, i32 %tmp264, i32 1
+ %tmp271 = insertelement <8 x i32> %tmp270, i32 %tmp265, i32 2
+ %tmp272 = insertelement <8 x i32> %tmp271, i32 %tmp266, i32 3
+ %tmp273 = insertelement <8 x i32> %tmp272, i32 %tmp267, i32 4
+ %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5
+ %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6
+ %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7
+ %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32>
+ %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp278 = extractelement <4 x float> %tmp277, i32 0
+ %tmp279 = extractelement <4 x float> %tmp277, i32 1
+ %tmp280 = extractelement <4 x float> %tmp277, i32 2
+ %tmp281 = extractelement <4 x float> %tmp277, i32 3
+ %tmp282 = fmul float %tmp281, %tmp46
+ %tmp283 = bitcast float %tmp135 to i32
+ %tmp284 = bitcast float %tmp181 to i32
+ %tmp285 = bitcast float %tmp136 to i32
+ %tmp286 = bitcast float %tmp182 to i32
+ %tmp287 = bitcast float %temp28.1 to i32
+ %tmp288 = bitcast float %temp29.1 to i32
+ %tmp289 = insertelement <8 x i32> undef, i32 %tmp283, i32 0
+ %tmp290 = insertelement <8 x i32> %tmp289, i32 %tmp284, i32 1
+ %tmp291 = insertelement <8 x i32> %tmp290, i32 %tmp285, i32 2
+ %tmp292 = insertelement <8 x i32> %tmp291, i32 %tmp286, i32 3
+ %tmp293 = insertelement <8 x i32> %tmp292, i32 %tmp287, i32 4
+ %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5
+ %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6
+ %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7
+ %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32>
+ %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp298 = extractelement <4 x float> %tmp297, i32 0
+ %tmp299 = extractelement <4 x float> %tmp297, i32 1
+ %tmp300 = extractelement <4 x float> %tmp297, i32 2
+ %tmp301 = bitcast float %tmp135 to i32
+ %tmp302 = bitcast float %tmp181 to i32
+ %tmp303 = bitcast float %tmp136 to i32
+ %tmp304 = bitcast float %tmp182 to i32
+ %tmp305 = bitcast float %temp28.1 to i32
+ %tmp306 = bitcast float %temp29.1 to i32
+ %tmp307 = insertelement <8 x i32> undef, i32 %tmp301, i32 0
+ %tmp308 = insertelement <8 x i32> %tmp307, i32 %tmp302, i32 1
+ %tmp309 = insertelement <8 x i32> %tmp308, i32 %tmp303, i32 2
+ %tmp310 = insertelement <8 x i32> %tmp309, i32 %tmp304, i32 3
+ %tmp311 = insertelement <8 x i32> %tmp310, i32 %tmp305, i32 4
+ %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5
+ %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6
+ %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7
+ %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32>
+ %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp316 = extractelement <4 x float> %tmp315, i32 0
+ %tmp317 = extractelement <4 x float> %tmp315, i32 1
+ %tmp318 = extractelement <4 x float> %tmp315, i32 2
+ %tmp319 = fmul float %tmp316, %tmp22
+ %tmp320 = fmul float %tmp317, %tmp23
+ %tmp321 = fmul float %tmp318, %tmp24
+ %tmp322 = fmul float %tmp298, %tmp25
+ %tmp323 = fadd float %tmp322, %tmp319
+ %tmp324 = fmul float %tmp299, %tmp26
+ %tmp325 = fadd float %tmp324, %tmp320
+ %tmp326 = fmul float %tmp300, %tmp27
+ %tmp327 = fadd float %tmp326, %tmp321
+ %tmp328 = fadd float %tmp278, %tmp323
+ %tmp329 = fadd float %tmp279, %tmp325
+ %tmp330 = fadd float %tmp280, %tmp327
+ %tmp331 = bitcast float %tmp135 to i32
+ %tmp332 = bitcast float %tmp181 to i32
+ %tmp333 = bitcast float %tmp136 to i32
+ %tmp334 = bitcast float %tmp182 to i32
+ %tmp335 = bitcast float %temp28.1 to i32
+ %tmp336 = bitcast float %temp29.1 to i32
+ %tmp337 = insertelement <8 x i32> undef, i32 %tmp331, i32 0
+ %tmp338 = insertelement <8 x i32> %tmp337, i32 %tmp332, i32 1
+ %tmp339 = insertelement <8 x i32> %tmp338, i32 %tmp333, i32 2
+ %tmp340 = insertelement <8 x i32> %tmp339, i32 %tmp334, i32 3
+ %tmp341 = insertelement <8 x i32> %tmp340, i32 %tmp335, i32 4
+ %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5
+ %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6
+ %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7
+ %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp346 = extractelement <4 x float> %tmp345, i32 0
+ %tmp347 = extractelement <4 x float> %tmp345, i32 1
+ %tmp348 = extractelement <4 x float> %tmp345, i32 2
+ %tmp349 = fadd float %tmp346, -5.000000e-01
+ %tmp350 = fadd float %tmp347, -5.000000e-01
+ %tmp351 = fadd float %tmp348, -5.000000e-01
+ %tmp352 = fmul float %tmp349, %tmp349
+ %tmp353 = fmul float %tmp350, %tmp350
+ %tmp354 = fadd float %tmp353, %tmp352
+ %tmp355 = fmul float %tmp351, %tmp351
+ %tmp356 = fadd float %tmp354, %tmp355
+ %tmp357 = call float @llvm.amdgcn.rsq.f32(float %tmp356)
+ %tmp358 = fmul float %tmp349, %tmp357
+ %tmp359 = fmul float %tmp350, %tmp357
+ %tmp360 = fmul float %tmp351, %tmp357
+ %tmp361 = bitcast float %tmp135 to i32
+ %tmp362 = bitcast float %tmp181 to i32
+ %tmp363 = bitcast float %tmp136 to i32
+ %tmp364 = bitcast float %tmp182 to i32
+ %tmp365 = bitcast float %temp28.1 to i32
+ %tmp366 = bitcast float %temp29.1 to i32
+ %tmp367 = insertelement <8 x i32> undef, i32 %tmp361, i32 0
+ %tmp368 = insertelement <8 x i32> %tmp367, i32 %tmp362, i32 1
+ %tmp369 = insertelement <8 x i32> %tmp368, i32 %tmp363, i32 2
+ %tmp370 = insertelement <8 x i32> %tmp369, i32 %tmp364, i32 3
+ %tmp371 = insertelement <8 x i32> %tmp370, i32 %tmp365, i32 4
+ %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5
+ %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6
+ %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7
+ %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32>
+ %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp376 = extractelement <4 x float> %tmp375, i32 0
+ %tmp377 = extractelement <4 x float> %tmp375, i32 1
+ %tmp378 = extractelement <4 x float> %tmp375, i32 2
+ %tmp379 = extractelement <4 x float> %tmp375, i32 3
+ %tmp380 = fsub float -0.000000e+00, %tmp94
+ %tmp381 = fsub float -0.000000e+00, %tmp95
+ %tmp382 = fsub float -0.000000e+00, %tmp96
+ %tmp383 = fmul float %tmp358, %tmp380
+ %tmp384 = fmul float %tmp359, %tmp381
+ %tmp385 = fadd float %tmp384, %tmp383
+ %tmp386 = fmul float %tmp360, %tmp382
+ %tmp387 = fadd float %tmp385, %tmp386
+ %tmp388 = fmul float %tmp387, %tmp358
+ %tmp389 = fmul float %tmp387, %tmp359
+ %tmp390 = fmul float %tmp387, %tmp360
+ %tmp391 = fmul float 2.000000e+00, %tmp388
+ %tmp392 = fmul float 2.000000e+00, %tmp389
+ %tmp393 = fmul float 2.000000e+00, %tmp390
+ %tmp394 = fsub float -0.000000e+00, %tmp391
+ %tmp395 = fadd float %tmp380, %tmp394
+ %tmp396 = fsub float -0.000000e+00, %tmp392
+ %tmp397 = fadd float %tmp381, %tmp396
+ %tmp398 = fsub float -0.000000e+00, %tmp393
+ %tmp399 = fadd float %tmp382, %tmp398
+ %tmp400 = fmul float %tmp395, %tmp97
+ %tmp401 = fmul float %tmp395, %tmp98
+ %tmp402 = fmul float %tmp395, %tmp99
+ %tmp403 = fmul float %tmp397, %tmp100
+ %tmp404 = fadd float %tmp403, %tmp400
+ %tmp405 = fmul float %tmp397, %tmp101
+ %tmp406 = fadd float %tmp405, %tmp401
+ %tmp407 = fmul float %tmp397, %tmp102
+ %tmp408 = fadd float %tmp407, %tmp402
+ %tmp409 = fmul float %tmp399, %tmp103
+ %tmp410 = fadd float %tmp409, %tmp404
+ %tmp411 = fmul float %tmp399, %tmp104
+ %tmp412 = fadd float %tmp411, %tmp406
+ %tmp413 = fmul float %tmp399, %tmp105
+ %tmp414 = fadd float %tmp413, %tmp408
+ %tmp415 = bitcast float %tmp135 to i32
+ %tmp416 = bitcast float %tmp181 to i32
+ %tmp417 = bitcast float %tmp136 to i32
+ %tmp418 = bitcast float %tmp182 to i32
+ %tmp419 = bitcast float %temp28.1 to i32
+ %tmp420 = bitcast float %temp29.1 to i32
+ %tmp421 = insertelement <8 x i32> undef, i32 %tmp415, i32 0
+ %tmp422 = insertelement <8 x i32> %tmp421, i32 %tmp416, i32 1
+ %tmp423 = insertelement <8 x i32> %tmp422, i32 %tmp417, i32 2
+ %tmp424 = insertelement <8 x i32> %tmp423, i32 %tmp418, i32 3
+ %tmp425 = insertelement <8 x i32> %tmp424, i32 %tmp419, i32 4
+ %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5
+ %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6
+ %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7
+ %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32>
+ %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp430 = extractelement <4 x float> %tmp429, i32 0
+ %tmp431 = extractelement <4 x float> %tmp429, i32 1
+ %tmp432 = extractelement <4 x float> %tmp429, i32 2
+ %tmp433 = fmul float %tmp47, %tmp410
+ %tmp434 = fmul float %tmp48, %tmp410
+ %tmp435 = fmul float %tmp49, %tmp410
+ %tmp436 = fmul float %tmp50, %tmp412
+ %tmp437 = fadd float %tmp436, %tmp433
+ %tmp438 = fmul float %tmp51, %tmp412
+ %tmp439 = fadd float %tmp438, %tmp434
+ %tmp440 = fmul float %tmp52, %tmp412
+ %tmp441 = fadd float %tmp440, %tmp435
+ %tmp442 = fmul float %tmp53, %tmp414
+ %tmp443 = fadd float %tmp442, %tmp437
+ %tmp444 = fmul float %tmp54, %tmp414
+ %tmp445 = fadd float %tmp444, %tmp439
+ %tmp446 = fmul float %tmp55, %tmp414
+ %tmp447 = fadd float %tmp446, %tmp441
+ %tmp448 = insertelement <4 x float> undef, float %tmp443, i32 0
+ %tmp449 = insertelement <4 x float> %tmp448, float %tmp445, i32 1
+ %tmp450 = insertelement <4 x float> %tmp449, float %tmp447, i32 2
+ %tmp451 = insertelement <4 x float> %tmp450, float %tmp194, i32 3
+ %tmp452 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp451)
+ %tmp453 = extractelement <4 x float> %tmp452, i32 0
+ %tmp454 = extractelement <4 x float> %tmp452, i32 1
+ %tmp455 = extractelement <4 x float> %tmp452, i32 2
+ %tmp456 = extractelement <4 x float> %tmp452, i32 3
+ %tmp457 = call float @fabs(float %tmp455)
+ %tmp458 = fdiv float 1.000000e+00, %tmp457
+ %tmp459 = fmul float %tmp453, %tmp458
+ %tmp460 = fadd float %tmp459, 1.500000e+00
+ %tmp461 = fmul float %tmp454, %tmp458
+ %tmp462 = fadd float %tmp461, 1.500000e+00
+ %tmp463 = bitcast float %tmp462 to i32
+ %tmp464 = bitcast float %tmp460 to i32
+ %tmp465 = bitcast float %tmp456 to i32
+ %tmp466 = insertelement <4 x i32> undef, i32 %tmp463, i32 0
+ %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1
+ %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2
+ %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3
+ %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32>
+ %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp471 = extractelement <4 x float> %tmp470, i32 0
+ %tmp472 = extractelement <4 x float> %tmp470, i32 1
+ %tmp473 = extractelement <4 x float> %tmp470, i32 2
+ %tmp474 = fmul float %tmp430, %tmp471
+ %tmp475 = fadd float %tmp474, %tmp328
+ %tmp476 = fmul float %tmp431, %tmp472
+ %tmp477 = fadd float %tmp476, %tmp329
+ %tmp478 = fmul float %tmp432, %tmp473
+ %tmp479 = fadd float %tmp478, %tmp330
+ %tmp480 = fmul float %tmp106, %tmp106
+ %tmp481 = fmul float %tmp107, %tmp107
+ %tmp482 = fadd float %tmp481, %tmp480
+ %tmp483 = fmul float %tmp108, %tmp108
+ %tmp484 = fadd float %tmp482, %tmp483
+ %tmp485 = call float @llvm.amdgcn.rsq.f32(float %tmp484)
+ %tmp486 = fmul float %tmp106, %tmp485
+ %tmp487 = fmul float %tmp107, %tmp485
+ %tmp488 = fmul float %tmp108, %tmp485
+ %tmp489 = fmul float %tmp376, %tmp39
+ %tmp490 = fmul float %tmp377, %tmp40
+ %tmp491 = fmul float %tmp378, %tmp41
+ %tmp492 = fmul float %tmp358, %tmp486
+ %tmp493 = fmul float %tmp359, %tmp487
+ %tmp494 = fadd float %tmp493, %tmp492
+ %tmp495 = fmul float %tmp360, %tmp488
+ %tmp496 = fadd float %tmp494, %tmp495
+ %tmp497 = fmul float %tmp496, %tmp358
+ %tmp498 = fmul float %tmp496, %tmp359
+ %tmp499 = fmul float %tmp496, %tmp360
+ %tmp500 = fmul float 2.000000e+00, %tmp497
+ %tmp501 = fmul float 2.000000e+00, %tmp498
+ %tmp502 = fmul float 2.000000e+00, %tmp499
+ %tmp503 = fsub float -0.000000e+00, %tmp500
+ %tmp504 = fadd float %tmp486, %tmp503
+ %tmp505 = fsub float -0.000000e+00, %tmp501
+ %tmp506 = fadd float %tmp487, %tmp505
+ %tmp507 = fsub float -0.000000e+00, %tmp502
+ %tmp508 = fadd float %tmp488, %tmp507
+ %tmp509 = fmul float %tmp94, %tmp94
+ %tmp510 = fmul float %tmp95, %tmp95
+ %tmp511 = fadd float %tmp510, %tmp509
+ %tmp512 = fmul float %tmp96, %tmp96
+ %tmp513 = fadd float %tmp511, %tmp512
+ %tmp514 = call float @llvm.amdgcn.rsq.f32(float %tmp513)
+ %tmp515 = fmul float %tmp94, %tmp514
+ %tmp516 = fmul float %tmp95, %tmp514
+ %tmp517 = fmul float %tmp96, %tmp514
+ %tmp518 = fmul float %tmp504, %tmp515
+ %tmp519 = fmul float %tmp506, %tmp516
+ %tmp520 = fadd float %tmp519, %tmp518
+ %tmp521 = fmul float %tmp508, %tmp517
+ %tmp522 = fadd float %tmp520, %tmp521
+ %tmp523 = fsub float -0.000000e+00, %tmp522
+ %tmp524 = fcmp uge float %tmp523, 0.000000e+00
+ %tmp525 = select i1 %tmp524, float %tmp523, float 0.000000e+00
+ %tmp526 = fmul float %tmp42, %tmp379
+ %tmp527 = fadd float %tmp526, 1.000000e+00
+ %tmp528 = call float @llvm.pow.f32(float %tmp525, float %tmp527)
+ %tmp529 = fmul float %tmp475, %tmp36
+ %tmp530 = fmul float %tmp477, %tmp37
+ %tmp531 = fmul float %tmp479, %tmp38
+ %tmp532 = fmul float %tmp358, %tmp486
+ %tmp533 = fmul float %tmp359, %tmp487
+ %tmp534 = fadd float %tmp533, %tmp532
+ %tmp535 = fmul float %tmp360, %tmp488
+ %tmp536 = fadd float %tmp534, %tmp535
+ %tmp537 = fcmp uge float %tmp536, 0.000000e+00
+ %tmp538 = select i1 %tmp537, float %tmp536, float 0.000000e+00
+ %tmp539 = fmul float %tmp529, %tmp538
+ %tmp540 = fmul float %tmp530, %tmp538
+ %tmp541 = fmul float %tmp531, %tmp538
+ %tmp542 = fmul float %tmp489, %tmp528
+ %tmp543 = fadd float %tmp542, %tmp539
+ %tmp544 = fmul float %tmp490, %tmp528
+ %tmp545 = fadd float %tmp544, %tmp540
+ %tmp546 = fmul float %tmp491, %tmp528
+ %tmp547 = fadd float %tmp546, %tmp541
+ %tmp548 = fmul float %tmp475, %tmp33
+ %tmp549 = fmul float %tmp477, %tmp34
+ %tmp550 = fmul float %tmp479, %tmp35
+ %tmp551 = fmul float %tmp543, %tmp56
+ %tmp552 = fadd float %tmp551, %tmp548
+ %tmp553 = fmul float %tmp545, %tmp57
+ %tmp554 = fadd float %tmp553, %tmp549
+ %tmp555 = fmul float %tmp547, %tmp58
+ %tmp556 = fadd float %tmp555, %tmp550
+ %tmp557 = bitcast float %tmp135 to i32
+ %tmp558 = bitcast float %tmp181 to i32
+ %tmp559 = bitcast float %tmp136 to i32
+ %tmp560 = bitcast float %tmp182 to i32
+ %tmp561 = bitcast float %temp28.1 to i32
+ %tmp562 = bitcast float %temp29.1 to i32
+ %tmp563 = insertelement <8 x i32> undef, i32 %tmp557, i32 0
+ %tmp564 = insertelement <8 x i32> %tmp563, i32 %tmp558, i32 1
+ %tmp565 = insertelement <8 x i32> %tmp564, i32 %tmp559, i32 2
+ %tmp566 = insertelement <8 x i32> %tmp565, i32 %tmp560, i32 3
+ %tmp567 = insertelement <8 x i32> %tmp566, i32 %tmp561, i32 4
+ %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5
+ %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6
+ %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7
+ %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32>
+ %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp572 = extractelement <4 x float> %tmp571, i32 0
+ %tmp573 = extractelement <4 x float> %tmp571, i32 1
+ %tmp574 = extractelement <4 x float> %tmp571, i32 2
+ %tmp575 = fmul float %tmp572, %tmp43
+ %tmp576 = fadd float %tmp575, %tmp552
+ %tmp577 = fmul float %tmp573, %tmp44
+ %tmp578 = fadd float %tmp577, %tmp554
+ %tmp579 = fmul float %tmp574, %tmp45
+ %tmp580 = fadd float %tmp579, %tmp556
+ %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
+ %tmp582 = bitcast i32 %tmp581 to float
+ %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
+ %tmp584 = bitcast i32 %tmp583 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584)
ret void
ENDIF66: ; preds = %LOOP65
- %586 = bitcast float %temp28.1 to i32
- %587 = bitcast float %temp29.1 to i32
- %588 = insertelement <8 x i32> %237, i32 %586, i32 4
- %589 = insertelement <8 x i32> %588, i32 %587, i32 5
- %590 = insertelement <8 x i32> %589, i32 undef, i32 6
- %591 = insertelement <8 x i32> %590, i32 undef, i32 7
- %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2)
- %593 = extractelement <4 x float> %592, i32 3
- %594 = fcmp oge float %temp30.1, %593
- %595 = sext i1 %594 to i32
- %596 = bitcast i32 %595 to float
- %597 = bitcast float %596 to i32
- %598 = and i32 %597, 1065353216
- %599 = bitcast i32 %598 to float
- %600 = fmul float 5.000000e-01, %temp32.0
- %601 = fsub float -0.000000e+00, %600
- %602 = fmul float %599, %temp32.0
- %603 = fadd float %602, %601
- %604 = fmul float %214, %603
- %605 = fadd float %604, %temp28.1
- %606 = fmul float %215, %603
- %607 = fadd float %606, %temp29.1
- %608 = fmul float %216, %603
- %609 = fadd float %608, %temp30.1
- %610 = fadd float %temp24.1, 1.000000e+00
- %611 = fmul float %temp32.0, 5.000000e-01
+ %tmp585 = bitcast float %temp28.1 to i32
+ %tmp586 = bitcast float %temp29.1 to i32
+ %tmp587 = insertelement <8 x i32> %tmp236, i32 %tmp585, i32 4
+ %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5
+ %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6
+ %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7
+ %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp592 = extractelement <4 x float> %tmp591, i32 3
+ %tmp593 = fcmp oge float %temp30.1, %tmp592
+ %tmp594 = sext i1 %tmp593 to i32
+ %tmp595 = bitcast i32 %tmp594 to float
+ %tmp596 = bitcast float %tmp595 to i32
+ %tmp597 = and i32 %tmp596, 1065353216
+ %tmp598 = bitcast i32 %tmp597 to float
+ %tmp599 = fmul float 5.000000e-01, %temp32.0
+ %tmp600 = fsub float -0.000000e+00, %tmp599
+ %tmp601 = fmul float %tmp598, %temp32.0
+ %tmp602 = fadd float %tmp601, %tmp600
+ %tmp603 = fmul float %tmp213, %tmp602
+ %tmp604 = fadd float %tmp603, %temp28.1
+ %tmp605 = fmul float %tmp214, %tmp602
+ %tmp606 = fadd float %tmp605, %temp29.1
+ %tmp607 = fmul float %tmp215, %tmp602
+ %tmp608 = fadd float %tmp607, %temp30.1
+ %tmp609 = fadd float %temp24.1, 1.000000e+00
+ %tmp610 = fmul float %temp32.0, 5.000000e-01
br label %LOOP65
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-; Function Attrs: readnone
-declare i32 @llvm.SI.tid() #2
-
-; Function Attrs: readonly
-declare float @ceil(float) #3
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2
-
-; Function Attrs: readnone
-declare float @fabs(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #4
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
-attributes #3 = { readonly }
-attributes #4 = { nounwind readonly }
-
-!0 = !{!"const", null, i32 1}
-
; CHECK-LABEL: {{^}}main1:
; CHECK: s_endpgm
-define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0)
- %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4)
- %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8)
- %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12)
- %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28)
- %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48)
- %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52)
- %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56)
- %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64)
- %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68)
- %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72)
- %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76)
- %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
- %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
- %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
- %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148)
- %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152)
- %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
- %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 164)
- %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168)
- %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172)
- %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
- %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
- %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
- %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
- %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
- %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
- %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
- %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
- %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
- %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220)
- %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236)
- %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
- %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
- %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
- %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252)
- %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
- %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260)
- %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264)
- %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268)
- %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
- %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
- %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
- %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284)
- %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
- %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
- %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464)
- %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468)
- %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472)
- %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496)
- %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500)
- %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504)
- %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512)
- %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516)
- %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524)
- %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532)
- %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536)
- %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540)
- %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544)
- %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548)
- %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552)
- %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556)
- %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560)
- %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564)
- %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568)
- %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572)
- %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576)
- %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580)
- %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584)
- %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588)
- %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592)
- %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596)
- %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600)
- %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604)
- %97 = call float @llvm.SI.load.const(<16 x i8> %22, i32 608)
- %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612)
- %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616)
- %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624)
- %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628)
- %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632)
- %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636)
- %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640)
- %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644)
- %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648)
- %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652)
- %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656)
- %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660)
- %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664)
- %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668)
- %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672)
- %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676)
- %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680)
- %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684)
- %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688)
- %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692)
- %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696)
- %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700)
- %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704)
- %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708)
- %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712)
- %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
- %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
- %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
- %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0
- %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0
- %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
- %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0
- %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
- %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0
- %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
- %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0
- %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
- %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0
- %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
- %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0
- %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
- %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0
- %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
- %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0
- %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
- %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0
- %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
- %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0
- %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
- %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0
- %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
- %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0
- %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
- %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0
- %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
- %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0
- %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
- %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0
- %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
- %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0
- %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
- %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0
- %162 = fcmp ugt float %17, 0.000000e+00
- %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
- %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
- %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
- %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6)
- %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6)
- %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
- %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
- %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
- %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6)
- %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
- %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
- %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
- %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6)
- %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
- %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
- %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
- %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6)
- %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
- %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
- %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
- %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6)
- %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
- %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
- %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
- %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6)
- %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6)
- %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6)
- %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6)
- %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6)
- %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6)
- %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6)
- %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6)
- %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6)
- %196 = fmul float %14, %124
- %197 = fadd float %196, %125
- %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00)
- %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %202 = bitcast float %198 to i32
- %203 = icmp ne i32 %202, 0
- %. = select i1 %203, float -1.000000e+00, float 1.000000e+00
- %204 = fsub float -0.000000e+00, %164
- %205 = fadd float %44, %204
- %206 = fsub float -0.000000e+00, %165
- %207 = fadd float %45, %206
- %208 = fsub float -0.000000e+00, %166
- %209 = fadd float %46, %208
- %210 = fmul float %205, %205
- %211 = fmul float %207, %207
- %212 = fadd float %211, %210
- %213 = fmul float %209, %209
- %214 = fadd float %212, %213
- %215 = call float @llvm.AMDGPU.rsq.f32(float %214)
- %216 = fmul float %205, %215
- %217 = fmul float %207, %215
- %218 = fmul float %209, %215
- %219 = fmul float %., %54
- %220 = fmul float %13, %47
- %221 = fmul float %197, %48
- %222 = bitcast float %174 to i32
- %223 = bitcast float %175 to i32
- %224 = insertelement <2 x i32> undef, i32 %222, i32 0
- %225 = insertelement <2 x i32> %224, i32 %223, i32 1
- %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2)
- %227 = extractelement <4 x float> %226, i32 0
- %228 = extractelement <4 x float> %226, i32 1
- %229 = extractelement <4 x float> %226, i32 2
- %230 = extractelement <4 x float> %226, i32 3
- %231 = fmul float %227, 0x4012611180000000
- %232 = fmul float %228, 0x4012611180000000
- %233 = fmul float %229, 0x4012611180000000
- %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00)
- %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00)
- %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00)
- %237 = fmul float %216, %184
- %238 = fmul float %217, %185
- %239 = fadd float %238, %237
- %240 = fmul float %218, %186
- %241 = fadd float %239, %240
- %242 = fmul float %216, %187
- %243 = fmul float %217, %188
- %244 = fadd float %243, %242
- %245 = fmul float %218, %189
- %246 = fadd float %244, %245
- %247 = fmul float %216, %190
- %248 = fmul float %217, %191
- %249 = fadd float %248, %247
- %250 = fmul float %218, %192
- %251 = fadd float %249, %250
- %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00)
- %253 = fmul float %214, 0x3F5A36E2E0000000
- %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00)
- %255 = fsub float -0.000000e+00, %254
- %256 = fadd float 1.000000e+00, %255
- %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01)
- %258 = fmul float %39, %257
- %259 = fmul float %241, %258
- %260 = fmul float %246, %258
- %261 = fmul float %259, %230
- %262 = fmul float %260, %230
- %263 = fadd float %252, 0x3EE4F8B580000000
- %264 = fsub float -0.000000e+00, %252
- %265 = fadd float 1.000000e+00, %264
- %266 = fmul float 1.200000e+01, %265
- %267 = fadd float %266, 4.000000e+00
- %268 = fsub float -0.000000e+00, %267
- %269 = fmul float %268, %263
- %270 = fsub float -0.000000e+00, %267
- %271 = fmul float %270, %263
- %272 = fsub float -0.000000e+00, %267
- %273 = fmul float %272, %263
- %274 = fdiv float 1.000000e+00, %269
- %275 = fdiv float 1.000000e+00, %271
- %276 = fdiv float 1.000000e+00, %273
- %277 = fmul float %261, %274
- %278 = fmul float %262, %275
- %279 = fmul float %263, %276
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 0)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 4)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 8)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 12)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 28)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 48)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 52)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 56)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 64)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 68)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 72)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 76)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132)
+ %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144)
+ %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 148)
+ %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 152)
+ %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160)
+ %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 164)
+ %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 168)
+ %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 172)
+ %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176)
+ %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180)
+ %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184)
+ %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192)
+ %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196)
+ %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200)
+ %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208)
+ %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212)
+ %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216)
+ %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 220)
+ %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 236)
+ %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240)
+ %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244)
+ %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248)
+ %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 252)
+ %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256)
+ %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 260)
+ %tmp60 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 264)
+ %tmp61 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 268)
+ %tmp62 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272)
+ %tmp63 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276)
+ %tmp64 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280)
+ %tmp65 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 284)
+ %tmp66 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288)
+ %tmp67 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292)
+ %tmp68 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 464)
+ %tmp69 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 468)
+ %tmp70 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 472)
+ %tmp71 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 496)
+ %tmp72 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 500)
+ %tmp73 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 504)
+ %tmp74 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 512)
+ %tmp75 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 516)
+ %tmp76 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 524)
+ %tmp77 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 532)
+ %tmp78 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 536)
+ %tmp79 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 540)
+ %tmp80 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 544)
+ %tmp81 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 548)
+ %tmp82 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 552)
+ %tmp83 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 556)
+ %tmp84 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 560)
+ %tmp85 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 564)
+ %tmp86 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 568)
+ %tmp87 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 572)
+ %tmp88 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 576)
+ %tmp89 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 580)
+ %tmp90 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 584)
+ %tmp91 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 588)
+ %tmp92 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 592)
+ %tmp93 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 596)
+ %tmp94 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 600)
+ %tmp95 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 604)
+ %tmp96 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 608)
+ %tmp97 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 612)
+ %tmp98 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 616)
+ %tmp99 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 624)
+ %tmp100 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 628)
+ %tmp101 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 632)
+ %tmp102 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 636)
+ %tmp103 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 640)
+ %tmp104 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 644)
+ %tmp105 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 648)
+ %tmp106 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 652)
+ %tmp107 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 656)
+ %tmp108 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 660)
+ %tmp109 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 664)
+ %tmp110 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 668)
+ %tmp111 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 672)
+ %tmp112 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 676)
+ %tmp113 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 680)
+ %tmp114 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 684)
+ %tmp115 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 688)
+ %tmp116 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 692)
+ %tmp117 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 696)
+ %tmp118 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 700)
+ %tmp119 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 704)
+ %tmp120 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 708)
+ %tmp121 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 712)
+ %tmp122 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 716)
+ %tmp123 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 864)
+ %tmp124 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 868)
+ %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0
+ %tmp127 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp128 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp127, !tbaa !0
+ %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
+ %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0
+ %tmp131 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1
+ %tmp132 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp131, !tbaa !0
+ %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
+ %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0
+ %tmp135 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2
+ %tmp136 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp135, !tbaa !0
+ %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
+ %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0
+ %tmp139 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3
+ %tmp140 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp139, !tbaa !0
+ %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
+ %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0
+ %tmp143 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4
+ %tmp144 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp143, !tbaa !0
+ %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
+ %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0
+ %tmp147 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5
+ %tmp148 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp147, !tbaa !0
+ %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
+ %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0
+ %tmp151 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6
+ %tmp152 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp151, !tbaa !0
+ %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
+ %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0
+ %tmp155 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
+ %tmp156 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp155, !tbaa !0
+ %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8
+ %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0
+ %tmp159 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 8
+ %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0
+ %tmp161 = fcmp ugt float %arg17, 0.000000e+00
+ %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
+ %tmp163 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp164 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp165 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp166 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp167 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp168 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp169 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp170 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp171 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp172 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp173 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp174 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp175 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp176 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp177 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp178 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp179 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp180 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp181 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp182 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp183 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp184 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp185 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp186 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp187 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp188 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp189 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp190 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp191 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp192 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp193 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp194 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp195 = fmul float %arg14, %tmp123
+ %tmp196 = fadd float %tmp195, %tmp124
+ %tmp197 = call float @llvm.AMDGPU.clamp.f32(float %tmp162, float 0.000000e+00, float 1.000000e+00)
+ %tmp198 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp199 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp200 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp201 = bitcast float %tmp197 to i32
+ %tmp202 = icmp ne i32 %tmp201, 0
+ %. = select i1 %tmp202, float -1.000000e+00, float 1.000000e+00
+ %tmp203 = fsub float -0.000000e+00, %tmp163
+ %tmp204 = fadd float %tmp43, %tmp203
+ %tmp205 = fsub float -0.000000e+00, %tmp164
+ %tmp206 = fadd float %tmp44, %tmp205
+ %tmp207 = fsub float -0.000000e+00, %tmp165
+ %tmp208 = fadd float %tmp45, %tmp207
+ %tmp209 = fmul float %tmp204, %tmp204
+ %tmp210 = fmul float %tmp206, %tmp206
+ %tmp211 = fadd float %tmp210, %tmp209
+ %tmp212 = fmul float %tmp208, %tmp208
+ %tmp213 = fadd float %tmp211, %tmp212
+ %tmp214 = call float @llvm.amdgcn.rsq.f32(float %tmp213)
+ %tmp215 = fmul float %tmp204, %tmp214
+ %tmp216 = fmul float %tmp206, %tmp214
+ %tmp217 = fmul float %tmp208, %tmp214
+ %tmp218 = fmul float %., %tmp53
+ %tmp219 = fmul float %arg13, %tmp46
+ %tmp220 = fmul float %tmp196, %tmp47
+ %tmp221 = bitcast float %tmp173 to i32
+ %tmp222 = bitcast float %tmp174 to i32
+ %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0
+ %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1
+ %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32>
+ %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp226 = extractelement <4 x float> %tmp225, i32 0
+ %tmp227 = extractelement <4 x float> %tmp225, i32 1
+ %tmp228 = extractelement <4 x float> %tmp225, i32 2
+ %tmp229 = extractelement <4 x float> %tmp225, i32 3
+ %tmp230 = fmul float %tmp226, 0x4012611180000000
+ %tmp231 = fmul float %tmp227, 0x4012611180000000
+ %tmp232 = fmul float %tmp228, 0x4012611180000000
+ %one.sub.a.i = fsub float 1.000000e+00, %tmp26
+ %result.i = fadd float %tmp230, %one.sub.a.i
+ %one.sub.a.i43 = fsub float 1.000000e+00, %tmp26
+ %result.i44 = fadd float %tmp231, %one.sub.a.i43
+ %one.sub.a.i41 = fsub float 1.000000e+00, %tmp26
+ %result.i42 = fadd float %tmp232, %one.sub.a.i41
+ %tmp233 = fmul float %tmp215, %tmp183
+ %tmp234 = fmul float %tmp216, %tmp184
+ %tmp235 = fadd float %tmp234, %tmp233
+ %tmp236 = fmul float %tmp217, %tmp185
+ %tmp237 = fadd float %tmp235, %tmp236
+ %tmp238 = fmul float %tmp215, %tmp186
+ %tmp239 = fmul float %tmp216, %tmp187
+ %tmp240 = fadd float %tmp239, %tmp238
+ %tmp241 = fmul float %tmp217, %tmp188
+ %tmp242 = fadd float %tmp240, %tmp241
+ %tmp243 = fmul float %tmp215, %tmp189
+ %tmp244 = fmul float %tmp216, %tmp190
+ %tmp245 = fadd float %tmp244, %tmp243
+ %tmp246 = fmul float %tmp217, %tmp191
+ %tmp247 = fadd float %tmp245, %tmp246
+ %tmp248 = call float @llvm.AMDGPU.clamp.f32(float %tmp247, float 0.000000e+00, float 1.000000e+00)
+ %tmp249 = fmul float %tmp213, 0x3F5A36E2E0000000
+ %tmp250 = call float @llvm.AMDGPU.clamp.f32(float %tmp249, float 0.000000e+00, float 1.000000e+00)
+ %tmp251 = fsub float -0.000000e+00, %tmp250
+ %tmp252 = fadd float 1.000000e+00, %tmp251
+ %tmp253 = call float @llvm.pow.f32(float %tmp248, float 2.500000e-01)
+ %tmp254 = fmul float %tmp38, %tmp253
+ %tmp255 = fmul float %tmp237, %tmp254
+ %tmp256 = fmul float %tmp242, %tmp254
+ %tmp257 = fmul float %tmp255, %tmp229
+ %tmp258 = fmul float %tmp256, %tmp229
+ %tmp259 = fadd float %tmp248, 0x3EE4F8B580000000
+ %tmp260 = fsub float -0.000000e+00, %tmp248
+ %tmp261 = fadd float 1.000000e+00, %tmp260
+ %tmp262 = fmul float 1.200000e+01, %tmp261
+ %tmp263 = fadd float %tmp262, 4.000000e+00
+ %tmp264 = fsub float -0.000000e+00, %tmp263
+ %tmp265 = fmul float %tmp264, %tmp259
+ %tmp266 = fsub float -0.000000e+00, %tmp263
+ %tmp267 = fmul float %tmp266, %tmp259
+ %tmp268 = fsub float -0.000000e+00, %tmp263
+ %tmp269 = fmul float %tmp268, %tmp259
+ %tmp270 = fdiv float 1.000000e+00, %tmp265
+ %tmp271 = fdiv float 1.000000e+00, %tmp267
+ %tmp272 = fdiv float 1.000000e+00, %tmp269
+ %tmp273 = fmul float %tmp257, %tmp270
+ %tmp274 = fmul float %tmp258, %tmp271
+ %tmp275 = fmul float %tmp259, %tmp272
br label %LOOP
LOOP: ; preds = %LOOP, %main_body
- %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ]
- %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ]
- %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ]
- %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ]
- %280 = bitcast float %temp168.0 to i32
- %281 = bitcast float %temp169.0 to i32
- %282 = insertelement <4 x i32> undef, i32 %280, i32 0
- %283 = insertelement <4 x i32> %282, i32 %281, i32 1
- %284 = insertelement <4 x i32> %283, i32 0, i32 2
- %285 = insertelement <4 x i32> %284, i32 undef, i32 3
- %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2)
- %287 = extractelement <4 x float> %286, i32 3
- %288 = fadd float %temp168.0, %277
- %289 = fadd float %temp169.0, %278
- %290 = fadd float %temp170.0, %279
- %291 = fsub float -0.000000e+00, %287
- %292 = fadd float %290, %291
- %293 = fcmp oge float 0.000000e+00, %292
- %294 = sext i1 %293 to i32
- %295 = bitcast i32 %294 to float
- %296 = bitcast float %295 to i32
- %297 = icmp ne i32 %296, 0
- br i1 %297, label %IF189, label %LOOP
+ %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp288, %LOOP ]
+ %temp168.0 = phi float [ %tmp175, %main_body ], [ %tmp284, %LOOP ]
+ %temp169.0 = phi float [ %tmp176, %main_body ], [ %tmp285, %LOOP ]
+ %temp170.0 = phi float [ %tmp252, %main_body ], [ %tmp286, %LOOP ]
+ %tmp276 = bitcast float %temp168.0 to i32
+ %tmp277 = bitcast float %temp169.0 to i32
+ %tmp278 = insertelement <4 x i32> undef, i32 %tmp276, i32 0
+ %tmp279 = insertelement <4 x i32> %tmp278, i32 %tmp277, i32 1
+ %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2
+ %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3
+ %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32>
+ %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp283 = extractelement <4 x float> %tmp282, i32 3
+ %tmp284 = fadd float %temp168.0, %tmp273
+ %tmp285 = fadd float %temp169.0, %tmp274
+ %tmp286 = fadd float %temp170.0, %tmp275
+ %tmp287 = fsub float -0.000000e+00, %tmp283
+ %tmp288 = fadd float %tmp286, %tmp287
+ %tmp289 = fcmp oge float 0.000000e+00, %tmp288
+ %tmp290 = sext i1 %tmp289 to i32
+ %tmp291 = bitcast i32 %tmp290 to float
+ %tmp292 = bitcast float %tmp291 to i32
+ %tmp293 = icmp ne i32 %tmp292, 0
+ br i1 %tmp293, label %IF189, label %LOOP
IF189: ; preds = %LOOP
- %298 = extractelement <4 x float> %286, i32 0
- %299 = extractelement <4 x float> %286, i32 1
- %300 = extractelement <4 x float> %286, i32 2
- %301 = fsub float -0.000000e+00, %292
- %302 = fadd float %temp144.0, %301
- %303 = fdiv float 1.000000e+00, %302
- %304 = fmul float %292, %303
- %305 = fadd float %304, -1.000000e+00
- %306 = fmul float %305, %277
- %307 = fadd float %306, %288
- %308 = fmul float %305, %278
- %309 = fadd float %308, %289
- %310 = fsub float -0.000000e+00, %176
- %311 = fadd float %307, %310
- %312 = fsub float -0.000000e+00, %177
- %313 = fadd float %309, %312
- %314 = fadd float %176, %311
- %315 = fadd float %177, %313
- %316 = fmul float %311, %67
- %317 = fmul float %313, %68
- %318 = fmul float %316, %55
- %319 = fmul float %316, %56
- %320 = fmul float %317, %57
- %321 = fadd float %320, %318
- %322 = fmul float %317, %58
- %323 = fadd float %322, %319
- %324 = fadd float %178, %321
- %325 = fadd float %179, %323
- %326 = fmul float %316, %59
- %327 = fmul float %316, %60
- %328 = fmul float %316, %61
- %329 = fmul float %316, %62
- %330 = fmul float %317, %63
- %331 = fadd float %330, %326
- %332 = fmul float %317, %64
- %333 = fadd float %332, %327
- %334 = fmul float %317, %65
- %335 = fadd float %334, %328
- %336 = fmul float %317, %66
- %337 = fadd float %336, %329
- %338 = fadd float %168, %331
- %339 = fadd float %169, %333
- %340 = fadd float %170, %335
- %341 = fadd float %171, %337
- %342 = bitcast float %338 to i32
- %343 = bitcast float %339 to i32
- %344 = insertelement <2 x i32> undef, i32 %342, i32 0
- %345 = insertelement <2 x i32> %344, i32 %343, i32 1
- %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2)
- %347 = extractelement <4 x float> %346, i32 0
- %348 = extractelement <4 x float> %346, i32 1
- %349 = extractelement <4 x float> %346, i32 2
- %350 = extractelement <4 x float> %346, i32 3
- %351 = fmul float %347, %23
- %352 = fmul float %348, %24
- %353 = fmul float %349, %25
- %354 = fmul float %350, %26
- %355 = fmul float %351, %180
- %356 = fmul float %352, %181
- %357 = fmul float %353, %182
- %358 = fmul float %354, %183
- %359 = fsub float -0.000000e+00, %350
- %360 = fadd float 1.000000e+00, %359
- %361 = fmul float %360, %49
- %362 = call float @llvm.AMDGPU.lrp(float %361, float %347, float %355)
- %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356)
- %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357)
- %365 = bitcast float %340 to i32
- %366 = bitcast float %341 to i32
- %367 = insertelement <2 x i32> undef, i32 %365, i32 0
- %368 = insertelement <2 x i32> %367, i32 %366, i32 1
- %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2)
- %370 = extractelement <4 x float> %369, i32 2
- %371 = fmul float %362, %234
- %372 = fmul float %363, %235
- %373 = fmul float %364, %236
- %374 = fmul float %358, %230
- %375 = bitcast float %314 to i32
- %376 = bitcast float %315 to i32
- %377 = insertelement <2 x i32> undef, i32 %375, i32 0
- %378 = insertelement <2 x i32> %377, i32 %376, i32 1
- %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2)
- %380 = extractelement <4 x float> %379, i32 0
- %381 = extractelement <4 x float> %379, i32 1
- %382 = extractelement <4 x float> %379, i32 2
- %383 = extractelement <4 x float> %379, i32 3
- %384 = fcmp olt float 0.000000e+00, %382
- %385 = sext i1 %384 to i32
- %386 = bitcast i32 %385 to float
- %387 = bitcast float %386 to i32
- %388 = icmp ne i32 %387, 0
- %.224 = select i1 %388, float %381, float %380
- %.225 = select i1 %388, float %383, float %381
- %389 = bitcast float %324 to i32
- %390 = bitcast float %325 to i32
- %391 = insertelement <2 x i32> undef, i32 %389, i32 0
- %392 = insertelement <2 x i32> %391, i32 %390, i32 1
- %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2)
- %394 = extractelement <4 x float> %393, i32 0
- %395 = extractelement <4 x float> %393, i32 1
- %396 = extractelement <4 x float> %393, i32 2
- %397 = extractelement <4 x float> %393, i32 3
- %398 = fcmp olt float 0.000000e+00, %396
- %399 = sext i1 %398 to i32
- %400 = bitcast i32 %399 to float
- %401 = bitcast float %400 to i32
- %402 = icmp ne i32 %401, 0
- %temp112.1 = select i1 %402, float %395, float %394
- %temp113.1 = select i1 %402, float %397, float %395
- %403 = fmul float %.224, 2.000000e+00
- %404 = fadd float %403, -1.000000e+00
- %405 = fmul float %.225, 2.000000e+00
- %406 = fadd float %405, -1.000000e+00
- %407 = fmul float %temp112.1, 2.000000e+00
- %408 = fadd float %407, -1.000000e+00
- %409 = fmul float %temp113.1, 2.000000e+00
- %410 = fadd float %409, -1.000000e+00
- %411 = fsub float -0.000000e+00, %404
- %412 = fmul float %411, %35
- %413 = fsub float -0.000000e+00, %406
- %414 = fmul float %413, %35
- %415 = fsub float -0.000000e+00, %408
- %416 = fmul float %415, %36
- %417 = fsub float -0.000000e+00, %410
- %418 = fmul float %417, %36
- %419 = fmul float %416, %370
- %420 = fmul float %418, %370
- %421 = call float @fabs(float %412)
- %422 = call float @fabs(float %414)
- %423 = fsub float -0.000000e+00, %421
- %424 = fadd float 1.000000e+00, %423
- %425 = fsub float -0.000000e+00, %422
- %426 = fadd float 1.000000e+00, %425
- %427 = fmul float %424, %419
- %428 = fadd float %427, %412
- %429 = fmul float %426, %420
- %430 = fadd float %429, %414
- %431 = fmul float %428, %428
- %432 = fmul float %430, %430
- %433 = fadd float %431, %432
- %434 = fsub float -0.000000e+00, %433
- %435 = fadd float 0x3FF00068E0000000, %434
- %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
- %437 = call float @llvm.AMDGPU.rsq.f32(float %436)
- %438 = fmul float %437, %436
- %439 = fsub float -0.000000e+00, %436
- %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
- %441 = fmul float %184, %428
- %442 = fmul float %185, %428
- %443 = fmul float %186, %428
- %444 = fmul float %187, %430
- %445 = fadd float %444, %441
- %446 = fmul float %188, %430
- %447 = fadd float %446, %442
- %448 = fmul float %189, %430
- %449 = fadd float %448, %443
- %450 = fmul float %190, %440
- %451 = fadd float %450, %445
- %452 = fmul float %191, %440
- %453 = fadd float %452, %447
- %454 = fmul float %192, %440
- %455 = fadd float %454, %449
- %456 = fmul float %451, %451
- %457 = fmul float %453, %453
- %458 = fadd float %457, %456
- %459 = fmul float %455, %455
- %460 = fadd float %458, %459
- %461 = call float @llvm.AMDGPU.rsq.f32(float %460)
- %462 = fmul float %451, %461
- %463 = fmul float %453, %461
- %464 = fmul float %455, %461
- %465 = fcmp olt float 0.000000e+00, %219
- %466 = sext i1 %465 to i32
- %467 = bitcast i32 %466 to float
- %468 = bitcast float %467 to i32
- %469 = icmp ne i32 %468, 0
- br i1 %469, label %IF198, label %ENDIF197
+ %tmp294 = extractelement <4 x float> %tmp282, i32 0
+ %tmp295 = extractelement <4 x float> %tmp282, i32 1
+ %tmp296 = extractelement <4 x float> %tmp282, i32 2
+ %tmp297 = fsub float -0.000000e+00, %tmp288
+ %tmp298 = fadd float %temp144.0, %tmp297
+ %tmp299 = fdiv float 1.000000e+00, %tmp298
+ %tmp300 = fmul float %tmp288, %tmp299
+ %tmp301 = fadd float %tmp300, -1.000000e+00
+ %tmp302 = fmul float %tmp301, %tmp273
+ %tmp303 = fadd float %tmp302, %tmp284
+ %tmp304 = fmul float %tmp301, %tmp274
+ %tmp305 = fadd float %tmp304, %tmp285
+ %tmp306 = fsub float -0.000000e+00, %tmp175
+ %tmp307 = fadd float %tmp303, %tmp306
+ %tmp308 = fsub float -0.000000e+00, %tmp176
+ %tmp309 = fadd float %tmp305, %tmp308
+ %tmp310 = fadd float %tmp175, %tmp307
+ %tmp311 = fadd float %tmp176, %tmp309
+ %tmp312 = fmul float %tmp307, %tmp66
+ %tmp313 = fmul float %tmp309, %tmp67
+ %tmp314 = fmul float %tmp312, %tmp54
+ %tmp315 = fmul float %tmp312, %tmp55
+ %tmp316 = fmul float %tmp313, %tmp56
+ %tmp317 = fadd float %tmp316, %tmp314
+ %tmp318 = fmul float %tmp313, %tmp57
+ %tmp319 = fadd float %tmp318, %tmp315
+ %tmp320 = fadd float %tmp177, %tmp317
+ %tmp321 = fadd float %tmp178, %tmp319
+ %tmp322 = fmul float %tmp312, %tmp58
+ %tmp323 = fmul float %tmp312, %tmp59
+ %tmp324 = fmul float %tmp312, %tmp60
+ %tmp325 = fmul float %tmp312, %tmp61
+ %tmp326 = fmul float %tmp313, %tmp62
+ %tmp327 = fadd float %tmp326, %tmp322
+ %tmp328 = fmul float %tmp313, %tmp63
+ %tmp329 = fadd float %tmp328, %tmp323
+ %tmp330 = fmul float %tmp313, %tmp64
+ %tmp331 = fadd float %tmp330, %tmp324
+ %tmp332 = fmul float %tmp313, %tmp65
+ %tmp333 = fadd float %tmp332, %tmp325
+ %tmp334 = fadd float %tmp167, %tmp327
+ %tmp335 = fadd float %tmp168, %tmp329
+ %tmp336 = fadd float %tmp169, %tmp331
+ %tmp337 = fadd float %tmp170, %tmp333
+ %tmp338 = bitcast float %tmp334 to i32
+ %tmp339 = bitcast float %tmp335 to i32
+ %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0
+ %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1
+ %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32>
+ %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp343 = extractelement <4 x float> %tmp342, i32 0
+ %tmp344 = extractelement <4 x float> %tmp342, i32 1
+ %tmp345 = extractelement <4 x float> %tmp342, i32 2
+ %tmp346 = extractelement <4 x float> %tmp342, i32 3
+ %tmp347 = fmul float %tmp343, %tmp22
+ %tmp348 = fmul float %tmp344, %tmp23
+ %tmp349 = fmul float %tmp345, %tmp24
+ %tmp350 = fmul float %tmp346, %tmp25
+ %tmp351 = fmul float %tmp347, %tmp179
+ %tmp352 = fmul float %tmp348, %tmp180
+ %tmp353 = fmul float %tmp349, %tmp181
+ %tmp354 = fmul float %tmp350, %tmp182
+ %tmp355 = fsub float -0.000000e+00, %tmp346
+ %tmp356 = fadd float 1.000000e+00, %tmp355
+ %tmp357 = fmul float %tmp356, %tmp48
+ %one.sub.a.i37 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i38 = fmul float %one.sub.a.i37, %tmp351
+ %mul.i39 = fmul float %tmp343, %tmp351
+ %result.i40 = fadd float %mul.i39, %one.sub.ac.i38
+ %one.sub.a.i33 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i34 = fmul float %one.sub.a.i33, %tmp352
+ %mul.i35 = fmul float %tmp344, %tmp352
+ %result.i36 = fadd float %mul.i35, %one.sub.ac.i34
+ %one.sub.a.i29 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i30 = fmul float %one.sub.a.i29, %tmp353
+ %mul.i31 = fmul float %tmp345, %tmp353
+ %result.i32 = fadd float %mul.i31, %one.sub.ac.i30
+ %tmp358 = bitcast float %tmp336 to i32
+ %tmp359 = bitcast float %tmp337 to i32
+ %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0
+ %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1
+ %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32>
+ %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp363 = extractelement <4 x float> %tmp362, i32 2
+ %tmp364 = fmul float %result.i40, %result.i
+ %tmp365 = fmul float %result.i36, %result.i44
+ %tmp366 = fmul float %result.i32, %result.i42
+ %tmp367 = fmul float %tmp354, %tmp229
+ %tmp368 = bitcast float %tmp310 to i32
+ %tmp369 = bitcast float %tmp311 to i32
+ %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0
+ %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1
+ %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32>
+ %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp373 = extractelement <4 x float> %tmp372, i32 0
+ %tmp374 = extractelement <4 x float> %tmp372, i32 1
+ %tmp375 = extractelement <4 x float> %tmp372, i32 2
+ %tmp376 = extractelement <4 x float> %tmp372, i32 3
+ %tmp377 = fcmp olt float 0.000000e+00, %tmp375
+ %tmp378 = sext i1 %tmp377 to i32
+ %tmp379 = bitcast i32 %tmp378 to float
+ %tmp380 = bitcast float %tmp379 to i32
+ %tmp381 = icmp ne i32 %tmp380, 0
+ %.224 = select i1 %tmp381, float %tmp374, float %tmp373
+ %.225 = select i1 %tmp381, float %tmp376, float %tmp374
+ %tmp382 = bitcast float %tmp320 to i32
+ %tmp383 = bitcast float %tmp321 to i32
+ %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0
+ %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1
+ %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32>
+ %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp387 = extractelement <4 x float> %tmp386, i32 0
+ %tmp388 = extractelement <4 x float> %tmp386, i32 1
+ %tmp389 = extractelement <4 x float> %tmp386, i32 2
+ %tmp390 = extractelement <4 x float> %tmp386, i32 3
+ %tmp391 = fcmp olt float 0.000000e+00, %tmp389
+ %tmp392 = sext i1 %tmp391 to i32
+ %tmp393 = bitcast i32 %tmp392 to float
+ %tmp394 = bitcast float %tmp393 to i32
+ %tmp395 = icmp ne i32 %tmp394, 0
+ %temp112.1 = select i1 %tmp395, float %tmp388, float %tmp387
+ %temp113.1 = select i1 %tmp395, float %tmp390, float %tmp388
+ %tmp396 = fmul float %.224, 2.000000e+00
+ %tmp397 = fadd float %tmp396, -1.000000e+00
+ %tmp398 = fmul float %.225, 2.000000e+00
+ %tmp399 = fadd float %tmp398, -1.000000e+00
+ %tmp400 = fmul float %temp112.1, 2.000000e+00
+ %tmp401 = fadd float %tmp400, -1.000000e+00
+ %tmp402 = fmul float %temp113.1, 2.000000e+00
+ %tmp403 = fadd float %tmp402, -1.000000e+00
+ %tmp404 = fsub float -0.000000e+00, %tmp397
+ %tmp405 = fmul float %tmp404, %tmp34
+ %tmp406 = fsub float -0.000000e+00, %tmp399
+ %tmp407 = fmul float %tmp406, %tmp34
+ %tmp408 = fsub float -0.000000e+00, %tmp401
+ %tmp409 = fmul float %tmp408, %tmp35
+ %tmp410 = fsub float -0.000000e+00, %tmp403
+ %tmp411 = fmul float %tmp410, %tmp35
+ %tmp412 = fmul float %tmp409, %tmp363
+ %tmp413 = fmul float %tmp411, %tmp363
+ %tmp414 = call float @fabs(float %tmp405)
+ %tmp415 = call float @fabs(float %tmp407)
+ %tmp416 = fsub float -0.000000e+00, %tmp414
+ %tmp417 = fadd float 1.000000e+00, %tmp416
+ %tmp418 = fsub float -0.000000e+00, %tmp415
+ %tmp419 = fadd float 1.000000e+00, %tmp418
+ %tmp420 = fmul float %tmp417, %tmp412
+ %tmp421 = fadd float %tmp420, %tmp405
+ %tmp422 = fmul float %tmp419, %tmp413
+ %tmp423 = fadd float %tmp422, %tmp407
+ %tmp424 = fmul float %tmp421, %tmp421
+ %tmp425 = fmul float %tmp423, %tmp423
+ %tmp426 = fadd float %tmp424, %tmp425
+ %tmp427 = fsub float -0.000000e+00, %tmp426
+ %tmp428 = fadd float 0x3FF00068E0000000, %tmp427
+ %tmp429 = call float @llvm.AMDGPU.clamp.f32(float %tmp428, float 0.000000e+00, float 1.000000e+00)
+ %tmp430 = call float @llvm.amdgcn.rsq.f32(float %tmp429)
+ %tmp431 = fmul float %tmp430, %tmp429
+ %tmp432 = fsub float -0.000000e+00, %tmp429
+ %cmp = fcmp ogt float 0.000000e+00, %tmp432
+ %tmp433 = select i1 %cmp, float %tmp431, float 0.000000e+00
+ %tmp434 = fmul float %tmp183, %tmp421
+ %tmp435 = fmul float %tmp184, %tmp421
+ %tmp436 = fmul float %tmp185, %tmp421
+ %tmp437 = fmul float %tmp186, %tmp423
+ %tmp438 = fadd float %tmp437, %tmp434
+ %tmp439 = fmul float %tmp187, %tmp423
+ %tmp440 = fadd float %tmp439, %tmp435
+ %tmp441 = fmul float %tmp188, %tmp423
+ %tmp442 = fadd float %tmp441, %tmp436
+ %tmp443 = fmul float %tmp189, %tmp433
+ %tmp444 = fadd float %tmp443, %tmp438
+ %tmp445 = fmul float %tmp190, %tmp433
+ %tmp446 = fadd float %tmp445, %tmp440
+ %tmp447 = fmul float %tmp191, %tmp433
+ %tmp448 = fadd float %tmp447, %tmp442
+ %tmp449 = fmul float %tmp444, %tmp444
+ %tmp450 = fmul float %tmp446, %tmp446
+ %tmp451 = fadd float %tmp450, %tmp449
+ %tmp452 = fmul float %tmp448, %tmp448
+ %tmp453 = fadd float %tmp451, %tmp452
+ %tmp454 = call float @llvm.amdgcn.rsq.f32(float %tmp453)
+ %tmp455 = fmul float %tmp444, %tmp454
+ %tmp456 = fmul float %tmp446, %tmp454
+ %tmp457 = fmul float %tmp448, %tmp454
+ %tmp458 = fcmp olt float 0.000000e+00, %tmp218
+ %tmp459 = sext i1 %tmp458 to i32
+ %tmp460 = bitcast i32 %tmp459 to float
+ %tmp461 = bitcast float %tmp460 to i32
+ %tmp462 = icmp ne i32 %tmp461, 0
+ br i1 %tmp462, label %IF198, label %ENDIF197
IF198: ; preds = %IF189
- %470 = fsub float -0.000000e+00, %462
- %471 = fsub float -0.000000e+00, %463
- %472 = fsub float -0.000000e+00, %464
+ %tmp463 = fsub float -0.000000e+00, %tmp455
+ %tmp464 = fsub float -0.000000e+00, %tmp456
+ %tmp465 = fsub float -0.000000e+00, %tmp457
br label %ENDIF197
-ENDIF197: ; preds = %IF189, %IF198
- %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ]
- %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ]
- %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ]
- %473 = bitcast float %220 to i32
- %474 = bitcast float %221 to i32
- %475 = insertelement <2 x i32> undef, i32 %473, i32 0
- %476 = insertelement <2 x i32> %475, i32 %474, i32 1
- %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2)
- %478 = extractelement <4 x float> %477, i32 0
- %479 = extractelement <4 x float> %477, i32 1
- %480 = extractelement <4 x float> %477, i32 2
- %481 = extractelement <4 x float> %477, i32 3
- %482 = fmul float %478, %40
- %483 = fadd float %482, %41
- %484 = fmul float %479, %40
- %485 = fadd float %484, %41
- %486 = fmul float %480, %40
- %487 = fadd float %486, %41
- %488 = fmul float %481, %42
- %489 = fadd float %488, %43
- %490 = bitcast float %172 to i32
- %491 = bitcast float %173 to i32
- %492 = insertelement <2 x i32> undef, i32 %490, i32 0
- %493 = insertelement <2 x i32> %492, i32 %491, i32 1
- %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2)
- %495 = extractelement <4 x float> %494, i32 0
- %496 = extractelement <4 x float> %494, i32 1
- %497 = extractelement <4 x float> %494, i32 2
- %498 = extractelement <4 x float> %494, i32 3
- %499 = fmul float %498, 3.200000e+01
- %500 = fadd float %499, -1.600000e+01
- %501 = call float @llvm.AMDIL.exp.(float %500)
- %502 = fmul float %495, %501
- %503 = fmul float %496, %501
- %504 = fmul float %497, %501
- %505 = fmul float %28, %502
- %506 = fadd float %505, %193
- %507 = fmul float %29, %503
- %508 = fadd float %507, %194
- %509 = fmul float %30, %504
- %510 = fadd float %509, %195
- %511 = fmul float %506, %489
- %512 = fmul float %508, %489
- %513 = fmul float %510, %489
- %514 = fmul float %489, 5.000000e-01
- %515 = fadd float %514, 5.000000e-01
- %516 = fmul float %483, %515
- %517 = fadd float %516, %511
- %518 = fmul float %485, %515
- %519 = fadd float %518, %512
- %520 = fmul float %487, %515
- %521 = fadd float %520, %513
- %522 = fmul float %517, %371
- %523 = fmul float %519, %372
- %524 = fmul float %521, %373
- %525 = fmul float %428, 0x3FDB272440000000
- %526 = fmul float %430, 0xBFDB272440000000
- %527 = fadd float %526, %525
- %528 = fmul float %440, 0x3FE99999A0000000
- %529 = fadd float %527, %528
- %530 = fmul float %529, 5.000000e-01
- %531 = fadd float %530, 0x3FE3333340000000
- %532 = fmul float %531, %531
- %533 = fmul float %522, %532
- %534 = fmul float %523, %532
- %535 = fmul float %524, %532
- %536 = fsub float -0.000000e+00, %72
- %537 = fsub float -0.000000e+00, %73
- %538 = fsub float -0.000000e+00, %74
- %539 = fmul float %temp12.0, %536
- %540 = fmul float %temp13.0, %537
- %541 = fadd float %540, %539
- %542 = fmul float %temp14.0, %538
- %543 = fadd float %541, %542
- %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00)
- %545 = fmul float %371, %544
- %546 = fmul float %372, %544
- %547 = fmul float %373, %544
- %548 = fmul float %545, %69
- %549 = fmul float %546, %70
- %550 = fmul float %547, %71
- %551 = fsub float -0.000000e+00, %164
- %552 = fadd float %97, %551
- %553 = fsub float -0.000000e+00, %165
- %554 = fadd float %98, %553
- %555 = fsub float -0.000000e+00, %166
- %556 = fadd float %99, %555
- %557 = fmul float %552, %552
- %558 = fmul float %554, %554
- %559 = fadd float %558, %557
- %560 = fmul float %556, %556
- %561 = fadd float %559, %560
- %562 = call float @llvm.AMDGPU.rsq.f32(float %561)
- %563 = fmul float %562, %561
- %564 = fsub float -0.000000e+00, %561
- %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
- %566 = fsub float -0.000000e+00, %84
- %567 = fadd float %565, %566
- %568 = fsub float -0.000000e+00, %83
- %569 = fadd float %565, %568
- %570 = fsub float -0.000000e+00, %82
- %571 = fadd float %565, %570
- %572 = fsub float -0.000000e+00, %84
- %573 = fadd float %83, %572
- %574 = fsub float -0.000000e+00, %83
- %575 = fadd float %82, %574
- %576 = fsub float -0.000000e+00, %82
- %577 = fadd float %81, %576
- %578 = fdiv float 1.000000e+00, %573
- %579 = fdiv float 1.000000e+00, %575
- %580 = fdiv float 1.000000e+00, %577
- %581 = fmul float %567, %578
- %582 = fmul float %569, %579
- %583 = fmul float %571, %580
- %584 = fcmp olt float %565, %83
- %585 = sext i1 %584 to i32
- %586 = bitcast i32 %585 to float
- %587 = bitcast float %586 to i32
- %588 = icmp ne i32 %587, 0
- br i1 %588, label %ENDIF200, label %ELSE202
+ENDIF197: ; preds = %IF198, %IF189
+ %temp14.0 = phi float [ %tmp465, %IF198 ], [ %tmp457, %IF189 ]
+ %temp13.0 = phi float [ %tmp464, %IF198 ], [ %tmp456, %IF189 ]
+ %temp12.0 = phi float [ %tmp463, %IF198 ], [ %tmp455, %IF189 ]
+ %tmp466 = bitcast float %tmp219 to i32
+ %tmp467 = bitcast float %tmp220 to i32
+ %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0
+ %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1
+ %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32>
+ %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp471 = extractelement <4 x float> %tmp470, i32 0
+ %tmp472 = extractelement <4 x float> %tmp470, i32 1
+ %tmp473 = extractelement <4 x float> %tmp470, i32 2
+ %tmp474 = extractelement <4 x float> %tmp470, i32 3
+ %tmp475 = fmul float %tmp471, %tmp39
+ %tmp476 = fadd float %tmp475, %tmp40
+ %tmp477 = fmul float %tmp472, %tmp39
+ %tmp478 = fadd float %tmp477, %tmp40
+ %tmp479 = fmul float %tmp473, %tmp39
+ %tmp480 = fadd float %tmp479, %tmp40
+ %tmp481 = fmul float %tmp474, %tmp41
+ %tmp482 = fadd float %tmp481, %tmp42
+ %tmp483 = bitcast float %tmp171 to i32
+ %tmp484 = bitcast float %tmp172 to i32
+ %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0
+ %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1
+ %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32>
+ %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp488 = extractelement <4 x float> %tmp487, i32 0
+ %tmp489 = extractelement <4 x float> %tmp487, i32 1
+ %tmp490 = extractelement <4 x float> %tmp487, i32 2
+ %tmp491 = extractelement <4 x float> %tmp487, i32 3
+ %tmp492 = fmul float %tmp491, 3.200000e+01
+ %tmp493 = fadd float %tmp492, -1.600000e+01
+ %tmp494 = call float @llvm.exp2.f32(float %tmp493)
+ %tmp495 = fmul float %tmp488, %tmp494
+ %tmp496 = fmul float %tmp489, %tmp494
+ %tmp497 = fmul float %tmp490, %tmp494
+ %tmp498 = fmul float %tmp27, %tmp495
+ %tmp499 = fadd float %tmp498, %tmp192
+ %tmp500 = fmul float %tmp28, %tmp496
+ %tmp501 = fadd float %tmp500, %tmp193
+ %tmp502 = fmul float %tmp29, %tmp497
+ %tmp503 = fadd float %tmp502, %tmp194
+ %tmp504 = fmul float %tmp499, %tmp482
+ %tmp505 = fmul float %tmp501, %tmp482
+ %tmp506 = fmul float %tmp503, %tmp482
+ %tmp507 = fmul float %tmp482, 5.000000e-01
+ %tmp508 = fadd float %tmp507, 5.000000e-01
+ %tmp509 = fmul float %tmp476, %tmp508
+ %tmp510 = fadd float %tmp509, %tmp504
+ %tmp511 = fmul float %tmp478, %tmp508
+ %tmp512 = fadd float %tmp511, %tmp505
+ %tmp513 = fmul float %tmp480, %tmp508
+ %tmp514 = fadd float %tmp513, %tmp506
+ %tmp515 = fmul float %tmp510, %tmp364
+ %tmp516 = fmul float %tmp512, %tmp365
+ %tmp517 = fmul float %tmp514, %tmp366
+ %tmp518 = fmul float %tmp421, 0x3FDB272440000000
+ %tmp519 = fmul float %tmp423, 0xBFDB272440000000
+ %tmp520 = fadd float %tmp519, %tmp518
+ %tmp521 = fmul float %tmp433, 0x3FE99999A0000000
+ %tmp522 = fadd float %tmp520, %tmp521
+ %tmp523 = fmul float %tmp522, 5.000000e-01
+ %tmp524 = fadd float %tmp523, 0x3FE3333340000000
+ %tmp525 = fmul float %tmp524, %tmp524
+ %tmp526 = fmul float %tmp515, %tmp525
+ %tmp527 = fmul float %tmp516, %tmp525
+ %tmp528 = fmul float %tmp517, %tmp525
+ %tmp529 = fsub float -0.000000e+00, %tmp71
+ %tmp530 = fsub float -0.000000e+00, %tmp72
+ %tmp531 = fsub float -0.000000e+00, %tmp73
+ %tmp532 = fmul float %temp12.0, %tmp529
+ %tmp533 = fmul float %temp13.0, %tmp530
+ %tmp534 = fadd float %tmp533, %tmp532
+ %tmp535 = fmul float %temp14.0, %tmp531
+ %tmp536 = fadd float %tmp534, %tmp535
+ %tmp537 = call float @llvm.AMDGPU.clamp.f32(float %tmp536, float 0.000000e+00, float 1.000000e+00)
+ %tmp538 = fmul float %tmp364, %tmp537
+ %tmp539 = fmul float %tmp365, %tmp537
+ %tmp540 = fmul float %tmp366, %tmp537
+ %tmp541 = fmul float %tmp538, %tmp68
+ %tmp542 = fmul float %tmp539, %tmp69
+ %tmp543 = fmul float %tmp540, %tmp70
+ %tmp544 = fsub float -0.000000e+00, %tmp163
+ %tmp545 = fadd float %tmp96, %tmp544
+ %tmp546 = fsub float -0.000000e+00, %tmp164
+ %tmp547 = fadd float %tmp97, %tmp546
+ %tmp548 = fsub float -0.000000e+00, %tmp165
+ %tmp549 = fadd float %tmp98, %tmp548
+ %tmp550 = fmul float %tmp545, %tmp545
+ %tmp551 = fmul float %tmp547, %tmp547
+ %tmp552 = fadd float %tmp551, %tmp550
+ %tmp553 = fmul float %tmp549, %tmp549
+ %tmp554 = fadd float %tmp552, %tmp553
+ %tmp555 = call float @llvm.amdgcn.rsq.f32(float %tmp554)
+ %tmp556 = fmul float %tmp555, %tmp554
+ %tmp557 = fsub float -0.000000e+00, %tmp554
+ %cmp1 = fcmp ogt float %tmp557, 0.000000e+00
+ %tmp558 = select i1 %cmp1, float %tmp556, float 0.000000e+00
+ %tmp559 = fsub float -0.000000e+00, %tmp83
+ %tmp560 = fadd float %tmp558, %tmp559
+ %tmp561 = fsub float -0.000000e+00, %tmp82
+ %tmp562 = fadd float %tmp558, %tmp561
+ %tmp563 = fsub float -0.000000e+00, %tmp81
+ %tmp564 = fadd float %tmp558, %tmp563
+ %tmp565 = fsub float -0.000000e+00, %tmp83
+ %tmp566 = fadd float %tmp82, %tmp565
+ %tmp567 = fsub float -0.000000e+00, %tmp82
+ %tmp568 = fadd float %tmp81, %tmp567
+ %tmp569 = fsub float -0.000000e+00, %tmp81
+ %tmp570 = fadd float %tmp80, %tmp569
+ %tmp571 = fdiv float 1.000000e+00, %tmp566
+ %tmp572 = fdiv float 1.000000e+00, %tmp568
+ %tmp573 = fdiv float 1.000000e+00, %tmp570
+ %tmp574 = fmul float %tmp560, %tmp571
+ %tmp575 = fmul float %tmp562, %tmp572
+ %tmp576 = fmul float %tmp564, %tmp573
+ %tmp577 = fcmp olt float %tmp558, %tmp82
+ %tmp578 = sext i1 %tmp577 to i32
+ %tmp579 = bitcast i32 %tmp578 to float
+ %tmp580 = bitcast float %tmp579 to i32
+ %tmp581 = icmp ne i32 %tmp580, 0
+ br i1 %tmp581, label %ENDIF200, label %ELSE202
ELSE202: ; preds = %ENDIF197
- %589 = fcmp olt float %565, %82
- %590 = sext i1 %589 to i32
- %591 = bitcast i32 %590 to float
- %592 = bitcast float %591 to i32
- %593 = icmp ne i32 %592, 0
- br i1 %593, label %ENDIF200, label %ELSE205
+ %tmp582 = fcmp olt float %tmp558, %tmp81
+ %tmp583 = sext i1 %tmp582 to i32
+ %tmp584 = bitcast i32 %tmp583 to float
+ %tmp585 = bitcast float %tmp584 to i32
+ %tmp586 = icmp ne i32 %tmp585, 0
+ br i1 %tmp586, label %ENDIF200, label %ELSE205
ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197
- %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ]
- %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ]
- %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ]
- %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ]
- %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ]
- %594 = fcmp olt float %565, %83
- %595 = sext i1 %594 to i32
- %596 = bitcast i32 %595 to float
- %597 = bitcast float %596 to i32
- %598 = icmp ne i32 %597, 0
- br i1 %598, label %ENDIF209, label %ELSE211
+ %temp80.0 = phi float [ %tmp574, %ENDIF197 ], [ %.226, %ELSE205 ], [ %tmp575, %ELSE202 ]
+ %temp88.0 = phi float [ %tmp121, %ENDIF197 ], [ %.227, %ELSE205 ], [ %tmp119, %ELSE202 ]
+ %temp89.0 = phi float [ %tmp122, %ENDIF197 ], [ %.228, %ELSE205 ], [ %tmp120, %ELSE202 ]
+ %temp90.0 = phi float [ %tmp119, %ENDIF197 ], [ %tmp115, %ELSE205 ], [ %tmp117, %ELSE202 ]
+ %temp91.0 = phi float [ %tmp120, %ENDIF197 ], [ %tmp116, %ELSE205 ], [ %tmp118, %ELSE202 ]
+ %tmp587 = fcmp olt float %tmp558, %tmp82
+ %tmp588 = sext i1 %tmp587 to i32
+ %tmp589 = bitcast i32 %tmp588 to float
+ %tmp590 = bitcast float %tmp589 to i32
+ %tmp591 = icmp ne i32 %tmp590, 0
+ br i1 %tmp591, label %ENDIF209, label %ELSE211
ELSE205: ; preds = %ELSE202
- %599 = fcmp olt float %565, %81
- %600 = sext i1 %599 to i32
- %601 = bitcast i32 %600 to float
- %602 = bitcast float %601 to i32
- %603 = icmp ne i32 %602, 0
- %.226 = select i1 %603, float %583, float 1.000000e+00
- %.227 = select i1 %603, float %118, float %116
- %.228 = select i1 %603, float %119, float %117
+ %tmp592 = fcmp olt float %tmp558, %tmp80
+ %tmp593 = sext i1 %tmp592 to i32
+ %tmp594 = bitcast i32 %tmp593 to float
+ %tmp595 = bitcast float %tmp594 to i32
+ %tmp596 = icmp ne i32 %tmp595, 0
+ %.226 = select i1 %tmp596, float %tmp576, float 1.000000e+00
+ %.227 = select i1 %tmp596, float %tmp117, float %tmp115
+ %.228 = select i1 %tmp596, float %tmp118, float %tmp116
br label %ENDIF200
ELSE211: ; preds = %ENDIF200
- %604 = fcmp olt float %565, %82
- %605 = sext i1 %604 to i32
- %606 = bitcast i32 %605 to float
- %607 = bitcast float %606 to i32
- %608 = icmp ne i32 %607, 0
- br i1 %608, label %ENDIF209, label %ELSE214
+ %tmp597 = fcmp olt float %tmp558, %tmp81
+ %tmp598 = sext i1 %tmp597 to i32
+ %tmp599 = bitcast i32 %tmp598 to float
+ %tmp600 = bitcast float %tmp599 to i32
+ %tmp601 = icmp ne i32 %tmp600, 0
+ br i1 %tmp601, label %ENDIF209, label %ELSE214
ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200
- %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ]
- %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ]
- %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ]
- %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ]
- %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ]
- %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ]
- %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ]
- %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ]
- %609 = fmul float %164, %85
- %610 = fmul float %165, %86
- %611 = fadd float %609, %610
- %612 = fmul float %166, %87
- %613 = fadd float %611, %612
- %614 = fmul float %167, %88
- %615 = fadd float %613, %614
- %616 = fmul float %164, %89
- %617 = fmul float %165, %90
- %618 = fadd float %616, %617
- %619 = fmul float %166, %91
- %620 = fadd float %618, %619
- %621 = fmul float %167, %92
- %622 = fadd float %620, %621
- %623 = fmul float %164, %93
- %624 = fmul float %165, %94
- %625 = fadd float %623, %624
- %626 = fmul float %166, %95
- %627 = fadd float %625, %626
- %628 = fmul float %167, %96
- %629 = fadd float %627, %628
- %630 = fsub float -0.000000e+00, %78
- %631 = fadd float 1.000000e+00, %630
- %632 = call float @fabs(float %615)
- %633 = call float @fabs(float %622)
- %634 = fcmp oge float %631, %632
- %635 = sext i1 %634 to i32
- %636 = bitcast i32 %635 to float
- %637 = bitcast float %636 to i32
- %638 = and i32 %637, 1065353216
- %639 = bitcast i32 %638 to float
- %640 = fcmp oge float %631, %633
- %641 = sext i1 %640 to i32
- %642 = bitcast i32 %641 to float
- %643 = bitcast float %642 to i32
- %644 = and i32 %643, 1065353216
- %645 = bitcast i32 %644 to float
- %646 = fmul float %639, %645
- %647 = fmul float %629, %646
- %648 = fmul float %615, %temp68.0
- %649 = fadd float %648, %temp70.0
- %650 = fmul float %622, %temp69.0
- %651 = fadd float %650, %temp71.0
- %652 = fmul float %615, %temp52.0
- %653 = fadd float %652, %temp54.0
- %654 = fmul float %622, %temp53.0
- %655 = fadd float %654, %temp55.0
- %656 = fadd float %temp80.0, -1.000000e+00
- %657 = fmul float %656, %77
- %658 = fadd float %657, 1.000000e+00
- %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00)
- %660 = bitcast float %649 to i32
- %661 = bitcast float %651 to i32
- %662 = bitcast float 0.000000e+00 to i32
- %663 = insertelement <4 x i32> undef, i32 %660, i32 0
- %664 = insertelement <4 x i32> %663, i32 %661, i32 1
- %665 = insertelement <4 x i32> %664, i32 %662, i32 2
- %666 = insertelement <4 x i32> %665, i32 undef, i32 3
- %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2)
- %668 = extractelement <4 x float> %667, i32 0
- %669 = extractelement <4 x float> %667, i32 1
- %670 = bitcast float %653 to i32
- %671 = bitcast float %655 to i32
- %672 = bitcast float 0.000000e+00 to i32
- %673 = insertelement <4 x i32> undef, i32 %670, i32 0
- %674 = insertelement <4 x i32> %673, i32 %671, i32 1
- %675 = insertelement <4 x i32> %674, i32 %672, i32 2
- %676 = insertelement <4 x i32> %675, i32 undef, i32 3
- %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2)
- %678 = extractelement <4 x float> %677, i32 0
- %679 = extractelement <4 x float> %677, i32 1
- %680 = fsub float -0.000000e+00, %669
- %681 = fadd float 1.000000e+00, %680
- %682 = fsub float -0.000000e+00, %679
- %683 = fadd float 1.000000e+00, %682
- %684 = fmul float %681, 2.500000e-01
- %685 = fmul float %683, 2.500000e-01
- %686 = fsub float -0.000000e+00, %684
- %687 = fadd float %668, %686
- %688 = fsub float -0.000000e+00, %685
- %689 = fadd float %678, %688
- %690 = fmul float %647, %temp88.0
- %691 = fadd float %690, %temp89.0
- %692 = fmul float %647, %temp90.0
- %693 = fadd float %692, %temp91.0
- %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00)
- %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00)
- %696 = fsub float -0.000000e+00, %694
- %697 = fadd float %668, %696
- %698 = fsub float -0.000000e+00, %695
- %699 = fadd float %678, %698
- %700 = fmul float %668, %668
- %701 = fmul float %678, %678
- %702 = fsub float -0.000000e+00, %700
- %703 = fadd float %687, %702
- %704 = fsub float -0.000000e+00, %701
- %705 = fadd float %689, %704
- %706 = fcmp uge float %703, %75
- %707 = select i1 %706, float %703, float %75
- %708 = fcmp uge float %705, %75
- %709 = select i1 %708, float %705, float %75
- %710 = fmul float %697, %697
- %711 = fadd float %710, %707
- %712 = fmul float %699, %699
- %713 = fadd float %712, %709
- %714 = fdiv float 1.000000e+00, %711
- %715 = fdiv float 1.000000e+00, %713
- %716 = fmul float %707, %714
- %717 = fmul float %709, %715
- %718 = fcmp oge float %697, 0.000000e+00
- %719 = sext i1 %718 to i32
- %720 = bitcast i32 %719 to float
- %721 = bitcast float %720 to i32
- %722 = icmp ne i32 %721, 0
- %.229 = select i1 %722, float 1.000000e+00, float %716
- %723 = fcmp oge float %699, 0.000000e+00
- %724 = sext i1 %723 to i32
- %725 = bitcast i32 %724 to float
- %726 = bitcast float %725 to i32
- %727 = icmp ne i32 %726, 0
- %temp28.0 = select i1 %727, float 1.000000e+00, float %717
- %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229)
- %729 = call float @llvm.pow.f32(float %728, float %76)
- %730 = fmul float %729, %79
- %731 = fadd float %730, %80
- %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00)
- %733 = fmul float %732, %732
- %734 = fmul float 2.000000e+00, %732
- %735 = fsub float -0.000000e+00, %734
- %736 = fadd float 3.000000e+00, %735
- %737 = fmul float %733, %736
- %738 = fmul float %548, %737
- %739 = fmul float %549, %737
- %740 = fmul float %550, %737
- %741 = fmul float %738, %515
- %742 = fadd float %741, %533
- %743 = fmul float %739, %515
- %744 = fadd float %743, %534
- %745 = fmul float %740, %515
- %746 = fadd float %745, %535
- %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00)
- %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00)
- %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00)
- %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00)
- %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00)
- %752 = fmul float %748, %751
- %753 = fmul float %749, %751
- %754 = fmul float %750, %751
- %755 = fmul float %742, %752
- %756 = fmul float %744, %753
- %757 = fmul float %746, %754
- %758 = fmul float %temp12.0, %216
- %759 = fmul float %temp13.0, %217
- %760 = fadd float %759, %758
- %761 = fmul float %temp14.0, %218
- %762 = fadd float %760, %761
- %763 = call float @fabs(float %762)
- %764 = fmul float %763, %763
- %765 = fmul float %764, %50
- %766 = fadd float %765, %51
- %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00)
- %768 = fsub float -0.000000e+00, %767
- %769 = fadd float 1.000000e+00, %768
- %770 = fmul float %33, %769
- %771 = fmul float %33, %769
- %772 = fmul float %33, %769
- %773 = fmul float %34, %769
- %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755)
- %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756)
- %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757)
- %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374)
- %778 = fcmp uge float %774, 0x3E6FFFFE60000000
- %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000
- %780 = fcmp uge float %775, 0x3E6FFFFE60000000
- %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000
- %782 = fcmp uge float %776, 0x3E6FFFFE60000000
- %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000
- %784 = fcmp uge float %779, 6.550400e+04
- %785 = select i1 %784, float 6.550400e+04, float %779
- %786 = fcmp uge float %781, 6.550400e+04
- %787 = select i1 %786, float 6.550400e+04, float %781
- %788 = fcmp uge float %783, 6.550400e+04
- %789 = select i1 %788, float 6.550400e+04, float %783
- %790 = fmul float %777, %52
- %791 = fadd float %790, %53
- %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00)
- %793 = call i32 @llvm.SI.packf16(float %785, float %787)
- %794 = bitcast i32 %793 to float
- %795 = call i32 @llvm.SI.packf16(float %789, float %792)
- %796 = bitcast i32 %795 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796)
+ %temp52.0 = phi float [ %tmp107, %ENDIF200 ], [ %tmp99, %ELSE214 ], [ %tmp103, %ELSE211 ]
+ %temp53.0 = phi float [ %tmp108, %ENDIF200 ], [ %tmp100, %ELSE214 ], [ %tmp104, %ELSE211 ]
+ %temp54.0 = phi float [ %tmp109, %ENDIF200 ], [ %tmp101, %ELSE214 ], [ %tmp105, %ELSE211 ]
+ %temp55.0 = phi float [ %tmp110, %ENDIF200 ], [ %tmp102, %ELSE214 ], [ %tmp106, %ELSE211 ]
+ %temp68.0 = phi float [ %tmp111, %ENDIF200 ], [ %.230, %ELSE214 ], [ %tmp107, %ELSE211 ]
+ %temp69.0 = phi float [ %tmp112, %ENDIF200 ], [ %.231, %ELSE214 ], [ %tmp108, %ELSE211 ]
+ %temp70.0 = phi float [ %tmp113, %ENDIF200 ], [ %.232, %ELSE214 ], [ %tmp109, %ELSE211 ]
+ %temp71.0 = phi float [ %tmp114, %ENDIF200 ], [ %.233, %ELSE214 ], [ %tmp110, %ELSE211 ]
+ %tmp602 = fmul float %tmp163, %tmp84
+ %tmp603 = fmul float %tmp164, %tmp85
+ %tmp604 = fadd float %tmp602, %tmp603
+ %tmp605 = fmul float %tmp165, %tmp86
+ %tmp606 = fadd float %tmp604, %tmp605
+ %tmp607 = fmul float %tmp166, %tmp87
+ %tmp608 = fadd float %tmp606, %tmp607
+ %tmp609 = fmul float %tmp163, %tmp88
+ %tmp610 = fmul float %tmp164, %tmp89
+ %tmp611 = fadd float %tmp609, %tmp610
+ %tmp612 = fmul float %tmp165, %tmp90
+ %tmp613 = fadd float %tmp611, %tmp612
+ %tmp614 = fmul float %tmp166, %tmp91
+ %tmp615 = fadd float %tmp613, %tmp614
+ %tmp616 = fmul float %tmp163, %tmp92
+ %tmp617 = fmul float %tmp164, %tmp93
+ %tmp618 = fadd float %tmp616, %tmp617
+ %tmp619 = fmul float %tmp165, %tmp94
+ %tmp620 = fadd float %tmp618, %tmp619
+ %tmp621 = fmul float %tmp166, %tmp95
+ %tmp622 = fadd float %tmp620, %tmp621
+ %tmp623 = fsub float -0.000000e+00, %tmp77
+ %tmp624 = fadd float 1.000000e+00, %tmp623
+ %tmp625 = call float @fabs(float %tmp608)
+ %tmp626 = call float @fabs(float %tmp615)
+ %tmp627 = fcmp oge float %tmp624, %tmp625
+ %tmp628 = sext i1 %tmp627 to i32
+ %tmp629 = bitcast i32 %tmp628 to float
+ %tmp630 = bitcast float %tmp629 to i32
+ %tmp631 = and i32 %tmp630, 1065353216
+ %tmp632 = bitcast i32 %tmp631 to float
+ %tmp633 = fcmp oge float %tmp624, %tmp626
+ %tmp634 = sext i1 %tmp633 to i32
+ %tmp635 = bitcast i32 %tmp634 to float
+ %tmp636 = bitcast float %tmp635 to i32
+ %tmp637 = and i32 %tmp636, 1065353216
+ %tmp638 = bitcast i32 %tmp637 to float
+ %tmp639 = fmul float %tmp632, %tmp638
+ %tmp640 = fmul float %tmp622, %tmp639
+ %tmp641 = fmul float %tmp608, %temp68.0
+ %tmp642 = fadd float %tmp641, %temp70.0
+ %tmp643 = fmul float %tmp615, %temp69.0
+ %tmp644 = fadd float %tmp643, %temp71.0
+ %tmp645 = fmul float %tmp608, %temp52.0
+ %tmp646 = fadd float %tmp645, %temp54.0
+ %tmp647 = fmul float %tmp615, %temp53.0
+ %tmp648 = fadd float %tmp647, %temp55.0
+ %tmp649 = fadd float %temp80.0, -1.000000e+00
+ %tmp650 = fmul float %tmp649, %tmp76
+ %tmp651 = fadd float %tmp650, 1.000000e+00
+ %tmp652 = call float @llvm.AMDGPU.clamp.f32(float %tmp651, float 0.000000e+00, float 1.000000e+00)
+ %tmp653 = bitcast float %tmp642 to i32
+ %tmp654 = bitcast float %tmp644 to i32
+ %tmp655 = bitcast float 0.000000e+00 to i32
+ %tmp656 = insertelement <4 x i32> undef, i32 %tmp653, i32 0
+ %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1
+ %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2
+ %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3
+ %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32>
+ %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp661 = extractelement <4 x float> %tmp660, i32 0
+ %tmp662 = extractelement <4 x float> %tmp660, i32 1
+ %tmp663 = bitcast float %tmp646 to i32
+ %tmp664 = bitcast float %tmp648 to i32
+ %tmp665 = bitcast float 0.000000e+00 to i32
+ %tmp666 = insertelement <4 x i32> undef, i32 %tmp663, i32 0
+ %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1
+ %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2
+ %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3
+ %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp671 = extractelement <4 x float> %tmp670, i32 0
+ %tmp672 = extractelement <4 x float> %tmp670, i32 1
+ %tmp673 = fsub float -0.000000e+00, %tmp662
+ %tmp674 = fadd float 1.000000e+00, %tmp673
+ %tmp675 = fsub float -0.000000e+00, %tmp672
+ %tmp676 = fadd float 1.000000e+00, %tmp675
+ %tmp677 = fmul float %tmp674, 2.500000e-01
+ %tmp678 = fmul float %tmp676, 2.500000e-01
+ %tmp679 = fsub float -0.000000e+00, %tmp677
+ %tmp680 = fadd float %tmp661, %tmp679
+ %tmp681 = fsub float -0.000000e+00, %tmp678
+ %tmp682 = fadd float %tmp671, %tmp681
+ %tmp683 = fmul float %tmp640, %temp88.0
+ %tmp684 = fadd float %tmp683, %temp89.0
+ %tmp685 = fmul float %tmp640, %temp90.0
+ %tmp686 = fadd float %tmp685, %temp91.0
+ %tmp687 = call float @llvm.AMDGPU.clamp.f32(float %tmp684, float 0.000000e+00, float 1.000000e+00)
+ %tmp688 = call float @llvm.AMDGPU.clamp.f32(float %tmp686, float 0.000000e+00, float 1.000000e+00)
+ %tmp689 = fsub float -0.000000e+00, %tmp687
+ %tmp690 = fadd float %tmp661, %tmp689
+ %tmp691 = fsub float -0.000000e+00, %tmp688
+ %tmp692 = fadd float %tmp671, %tmp691
+ %tmp693 = fmul float %tmp661, %tmp661
+ %tmp694 = fmul float %tmp671, %tmp671
+ %tmp695 = fsub float -0.000000e+00, %tmp693
+ %tmp696 = fadd float %tmp680, %tmp695
+ %tmp697 = fsub float -0.000000e+00, %tmp694
+ %tmp698 = fadd float %tmp682, %tmp697
+ %tmp699 = fcmp uge float %tmp696, %tmp74
+ %tmp700 = select i1 %tmp699, float %tmp696, float %tmp74
+ %tmp701 = fcmp uge float %tmp698, %tmp74
+ %tmp702 = select i1 %tmp701, float %tmp698, float %tmp74
+ %tmp703 = fmul float %tmp690, %tmp690
+ %tmp704 = fadd float %tmp703, %tmp700
+ %tmp705 = fmul float %tmp692, %tmp692
+ %tmp706 = fadd float %tmp705, %tmp702
+ %tmp707 = fdiv float 1.000000e+00, %tmp704
+ %tmp708 = fdiv float 1.000000e+00, %tmp706
+ %tmp709 = fmul float %tmp700, %tmp707
+ %tmp710 = fmul float %tmp702, %tmp708
+ %tmp711 = fcmp oge float %tmp690, 0.000000e+00
+ %tmp712 = sext i1 %tmp711 to i32
+ %tmp713 = bitcast i32 %tmp712 to float
+ %tmp714 = bitcast float %tmp713 to i32
+ %tmp715 = icmp ne i32 %tmp714, 0
+ %.229 = select i1 %tmp715, float 1.000000e+00, float %tmp709
+ %tmp716 = fcmp oge float %tmp692, 0.000000e+00
+ %tmp717 = sext i1 %tmp716 to i32
+ %tmp718 = bitcast i32 %tmp717 to float
+ %tmp719 = bitcast float %tmp718 to i32
+ %tmp720 = icmp ne i32 %tmp719, 0
+ %temp28.0 = select i1 %tmp720, float 1.000000e+00, float %tmp710
+ %one.sub.a.i25 = fsub float 1.000000e+00, %tmp652
+ %one.sub.ac.i26 = fmul float %one.sub.a.i25, %.229
+ %mul.i27 = fmul float %temp28.0, %.229
+ %result.i28 = fadd float %mul.i27, %one.sub.ac.i26
+ %tmp721 = call float @llvm.pow.f32(float %result.i28, float %tmp75)
+ %tmp722 = fmul float %tmp721, %tmp78
+ %tmp723 = fadd float %tmp722, %tmp79
+ %tmp724 = call float @llvm.AMDGPU.clamp.f32(float %tmp723, float 0.000000e+00, float 1.000000e+00)
+ %tmp725 = fmul float %tmp724, %tmp724
+ %tmp726 = fmul float 2.000000e+00, %tmp724
+ %tmp727 = fsub float -0.000000e+00, %tmp726
+ %tmp728 = fadd float 3.000000e+00, %tmp727
+ %tmp729 = fmul float %tmp725, %tmp728
+ %tmp730 = fmul float %tmp541, %tmp729
+ %tmp731 = fmul float %tmp542, %tmp729
+ %tmp732 = fmul float %tmp543, %tmp729
+ %tmp733 = fmul float %tmp730, %tmp508
+ %tmp734 = fadd float %tmp733, %tmp526
+ %tmp735 = fmul float %tmp731, %tmp508
+ %tmp736 = fadd float %tmp735, %tmp527
+ %tmp737 = fmul float %tmp732, %tmp508
+ %tmp738 = fadd float %tmp737, %tmp528
+ %one.sub.a.i23 = fsub float 1.000000e+00, %tmp229
+ %result.i24 = fadd float %tmp283, %one.sub.a.i23
+ %one.sub.a.i21 = fsub float 1.000000e+00, %tmp36
+ %result.i22 = fadd float %tmp294, %one.sub.a.i21
+ %one.sub.a.i19 = fsub float 1.000000e+00, %tmp36
+ %result.i20 = fadd float %tmp295, %one.sub.a.i19
+ %one.sub.a.i17 = fsub float 1.000000e+00, %tmp36
+ %result.i18 = fadd float %tmp296, %one.sub.a.i17
+ %one.sub.a.i15 = fsub float 1.000000e+00, %tmp37
+ %result.i16 = fadd float %result.i24, %one.sub.a.i15
+ %tmp739 = fmul float %result.i22, %result.i16
+ %tmp740 = fmul float %result.i20, %result.i16
+ %tmp741 = fmul float %result.i18, %result.i16
+ %tmp742 = fmul float %tmp734, %tmp739
+ %tmp743 = fmul float %tmp736, %tmp740
+ %tmp744 = fmul float %tmp738, %tmp741
+ %tmp745 = fmul float %temp12.0, %tmp215
+ %tmp746 = fmul float %temp13.0, %tmp216
+ %tmp747 = fadd float %tmp746, %tmp745
+ %tmp748 = fmul float %temp14.0, %tmp217
+ %tmp749 = fadd float %tmp747, %tmp748
+ %tmp750 = call float @fabs(float %tmp749)
+ %tmp751 = fmul float %tmp750, %tmp750
+ %tmp752 = fmul float %tmp751, %tmp49
+ %tmp753 = fadd float %tmp752, %tmp50
+ %tmp754 = call float @llvm.AMDGPU.clamp.f32(float %tmp753, float 0.000000e+00, float 1.000000e+00)
+ %tmp755 = fsub float -0.000000e+00, %tmp754
+ %tmp756 = fadd float 1.000000e+00, %tmp755
+ %tmp757 = fmul float %tmp32, %tmp756
+ %tmp758 = fmul float %tmp32, %tmp756
+ %tmp759 = fmul float %tmp32, %tmp756
+ %tmp760 = fmul float %tmp33, %tmp756
+ %one.sub.a.i11 = fsub float 1.000000e+00, %tmp757
+ %one.sub.ac.i12 = fmul float %one.sub.a.i11, %tmp742
+ %mul.i13 = fmul float %tmp30, %tmp742
+ %result.i14 = fadd float %mul.i13, %one.sub.ac.i12
+ %one.sub.a.i7 = fsub float 1.000000e+00, %tmp758
+ %one.sub.ac.i8 = fmul float %one.sub.a.i7, %tmp743
+ %mul.i9 = fmul float %tmp30, %tmp743
+ %result.i10 = fadd float %mul.i9, %one.sub.ac.i8
+ %one.sub.a.i3 = fsub float 1.000000e+00, %tmp759
+ %one.sub.ac.i4 = fmul float %one.sub.a.i3, %tmp744
+ %mul.i5 = fmul float %tmp30, %tmp744
+ %result.i6 = fadd float %mul.i5, %one.sub.ac.i4
+ %one.sub.a.i1 = fsub float 1.000000e+00, %tmp760
+ %one.sub.ac.i = fmul float %one.sub.a.i1, %tmp367
+ %mul.i = fmul float %tmp31, %tmp367
+ %result.i2 = fadd float %mul.i, %one.sub.ac.i
+ %tmp761 = fcmp uge float %result.i14, 0x3E6FFFFE60000000
+ %tmp762 = select i1 %tmp761, float %result.i14, float 0x3E6FFFFE60000000
+ %tmp763 = fcmp uge float %result.i10, 0x3E6FFFFE60000000
+ %tmp764 = select i1 %tmp763, float %result.i10, float 0x3E6FFFFE60000000
+ %tmp765 = fcmp uge float %result.i6, 0x3E6FFFFE60000000
+ %tmp766 = select i1 %tmp765, float %result.i6, float 0x3E6FFFFE60000000
+ %tmp767 = fcmp uge float %tmp762, 6.550400e+04
+ %tmp768 = select i1 %tmp767, float 6.550400e+04, float %tmp762
+ %tmp769 = fcmp uge float %tmp764, 6.550400e+04
+ %tmp770 = select i1 %tmp769, float 6.550400e+04, float %tmp764
+ %tmp771 = fcmp uge float %tmp766, 6.550400e+04
+ %tmp772 = select i1 %tmp771, float 6.550400e+04, float %tmp766
+ %tmp773 = fmul float %result.i2, %tmp51
+ %tmp774 = fadd float %tmp773, %tmp52
+ %tmp775 = call float @llvm.AMDGPU.clamp.f32(float %tmp774, float 0.000000e+00, float 1.000000e+00)
+ %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
+ %tmp777 = bitcast i32 %tmp776 to float
+ %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %tmp775)
+ %tmp779 = bitcast i32 %tmp778 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779)
ret void
ELSE214: ; preds = %ELSE211
- %797 = fcmp olt float %565, %81
- %798 = sext i1 %797 to i32
- %799 = bitcast i32 %798 to float
- %800 = bitcast float %799 to i32
- %801 = icmp ne i32 %800, 0
- %.230 = select i1 %801, float %104, float %100
- %.231 = select i1 %801, float %105, float %101
- %.232 = select i1 %801, float %106, float %102
- %.233 = select i1 %801, float %107, float %103
+ %tmp780 = fcmp olt float %tmp558, %tmp80
+ %tmp781 = sext i1 %tmp780 to i32
+ %tmp782 = bitcast i32 %tmp781 to float
+ %tmp783 = bitcast float %tmp782 to i32
+ %tmp784 = icmp ne i32 %tmp783, 0
+ %.230 = select i1 %tmp784, float %tmp103, float %tmp99
+ %.231 = select i1 %tmp784, float %tmp104, float %tmp100
+ %.232 = select i1 %tmp784, float %tmp105, float %tmp101
+ %.233 = select i1 %tmp784, float %tmp106, float %tmp102
br label %ENDIF209
}
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #2
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+
+declare float @llvm.exp2.f32(float) #2
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #2
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @ceil(float) #3
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.rsq.f32(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
; Function Attrs: readnone
-declare float @llvm.AMDGPU.cndlt(float, float, float) #2
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #2
+declare float @fabs(float) #1
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.pow.f32(float, float) #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #1 = { readnone }
+attributes #2 = { nounwind readnone }
attributes #3 = { nounwind readonly }
-attributes #4 = { readonly }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
index 4b2d8ec6bf0a..30aa2d550f65 100644
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -3,10 +3,10 @@
; If this occurs it is likely due to reordering and the restore was
; originally supposed to happen before SI_END_CF.
+
; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
; SI-NOT: v_readlane_b32 [[SAVED]]
-
-define void @main() #0 {
+define amdgpu_ps void @main() #0 {
main_body:
%0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
%1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
@@ -80,184 +80,198 @@ main_body:
LOOP: ; preds = %ENDIF2795, %main_body
%temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
%temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
- %67 = icmp sgt i32 undef, 4
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %67 = icmp sgt i32 %tid, 4
br i1 %67, label %ENDLOOP, label %ENDIF
ENDLOOP: ; preds = %ELSE2566, %LOOP
- %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
+ %one.sub.a.i = fsub float 1.000000e+00, %0
+ %one.sub.ac.i = fmul float %one.sub.a.i, undef
+ %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00)
ret void
ENDIF: ; preds = %LOOP
- %69 = fsub float %2, undef
- %70 = fsub float %3, undef
- %71 = fsub float %4, undef
- %72 = fmul float %69, 0.000000e+00
+ %68 = fsub float %2, undef
+ %69 = fsub float %3, undef
+ %70 = fsub float %4, undef
+ %71 = fmul float %68, 0.000000e+00
+ %72 = fmul float %69, undef
%73 = fmul float %70, undef
- %74 = fmul float %71, undef
- %75 = fsub float %6, undef
- %76 = fsub float %7, undef
- %77 = fmul float %75, undef
- %78 = fmul float %76, 0.000000e+00
- %79 = call float @llvm.minnum.f32(float %74, float %78)
- %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
- %81 = call float @llvm.maxnum.f32(float %73, float %77)
- %82 = call float @llvm.maxnum.f32(float undef, float %79)
- %83 = call float @llvm.minnum.f32(float %80, float %81)
- %84 = call float @llvm.minnum.f32(float %83, float undef)
- %85 = fsub float %14, undef
- %86 = fsub float %15, undef
- %87 = fsub float %16, undef
+ %74 = fsub float %6, undef
+ %75 = fsub float %7, undef
+ %76 = fmul float %74, undef
+ %77 = fmul float %75, 0.000000e+00
+ %78 = call float @llvm.minnum.f32(float %73, float %77)
+ %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00)
+ %80 = call float @llvm.maxnum.f32(float %72, float %76)
+ %81 = call float @llvm.maxnum.f32(float undef, float %78)
+ %82 = call float @llvm.minnum.f32(float %79, float %80)
+ %83 = call float @llvm.minnum.f32(float %82, float undef)
+ %84 = fsub float %14, undef
+ %85 = fsub float %15, undef
+ %86 = fsub float %16, undef
+ %87 = fmul float %84, undef
%88 = fmul float %85, undef
%89 = fmul float %86, undef
- %90 = fmul float %87, undef
- %91 = fsub float %17, undef
- %92 = fsub float %18, undef
- %93 = fsub float %19, undef
- %94 = fmul float %91, 0.000000e+00
+ %90 = fsub float %17, undef
+ %91 = fsub float %18, undef
+ %92 = fsub float %19, undef
+ %93 = fmul float %90, 0.000000e+00
+ %94 = fmul float %91, undef
%95 = fmul float %92, undef
- %96 = fmul float %93, undef
- %97 = call float @llvm.minnum.f32(float %89, float %95)
- %98 = call float @llvm.maxnum.f32(float %88, float %94)
- %99 = call float @llvm.maxnum.f32(float %90, float %96)
- %100 = call float @llvm.maxnum.f32(float undef, float %97)
- %101 = call float @llvm.maxnum.f32(float %100, float undef)
- %102 = call float @llvm.minnum.f32(float %98, float undef)
- %103 = call float @llvm.minnum.f32(float %102, float %99)
- %104 = fsub float %30, undef
- %105 = fsub float %31, undef
+ %96 = call float @llvm.minnum.f32(float %88, float %94)
+ %97 = call float @llvm.maxnum.f32(float %87, float %93)
+ %98 = call float @llvm.maxnum.f32(float %89, float %95)
+ %99 = call float @llvm.maxnum.f32(float undef, float %96)
+ %100 = call float @llvm.maxnum.f32(float %99, float undef)
+ %101 = call float @llvm.minnum.f32(float %97, float undef)
+ %102 = call float @llvm.minnum.f32(float %101, float %98)
+ %103 = fsub float %30, undef
+ %104 = fsub float %31, undef
+ %105 = fmul float %103, 0.000000e+00
%106 = fmul float %104, 0.000000e+00
- %107 = fmul float %105, 0.000000e+00
- %108 = call float @llvm.minnum.f32(float undef, float %106)
+ %107 = call float @llvm.minnum.f32(float undef, float %105)
+ %108 = call float @llvm.maxnum.f32(float undef, float %106)
%109 = call float @llvm.maxnum.f32(float undef, float %107)
- %110 = call float @llvm.maxnum.f32(float undef, float %108)
- %111 = call float @llvm.maxnum.f32(float %110, float undef)
- %112 = call float @llvm.minnum.f32(float undef, float %109)
- %113 = fsub float %32, undef
- %114 = fsub float %33, undef
- %115 = fsub float %34, undef
- %116 = fmul float %113, 0.000000e+00
+ %110 = call float @llvm.maxnum.f32(float %109, float undef)
+ %111 = call float @llvm.minnum.f32(float undef, float %108)
+ %112 = fsub float %32, undef
+ %113 = fsub float %33, undef
+ %114 = fsub float %34, undef
+ %115 = fmul float %112, 0.000000e+00
+ %116 = fmul float %113, undef
%117 = fmul float %114, undef
- %118 = fmul float %115, undef
- %119 = fsub float %35, undef
- %120 = fsub float %36, undef
- %121 = fsub float %37, undef
+ %118 = fsub float %35, undef
+ %119 = fsub float %36, undef
+ %120 = fsub float %37, undef
+ %121 = fmul float %118, undef
%122 = fmul float %119, undef
%123 = fmul float %120, undef
- %124 = fmul float %121, undef
+ %124 = call float @llvm.minnum.f32(float %115, float %121)
%125 = call float @llvm.minnum.f32(float %116, float %122)
%126 = call float @llvm.minnum.f32(float %117, float %123)
- %127 = call float @llvm.minnum.f32(float %118, float %124)
- %128 = call float @llvm.maxnum.f32(float %125, float %126)
- %129 = call float @llvm.maxnum.f32(float %128, float %127)
- %130 = fsub float %38, undef
- %131 = fsub float %39, undef
- %132 = fsub float %40, undef
- %133 = fmul float %130, 0.000000e+00
+ %127 = call float @llvm.maxnum.f32(float %124, float %125)
+ %128 = call float @llvm.maxnum.f32(float %127, float %126)
+ %129 = fsub float %38, undef
+ %130 = fsub float %39, undef
+ %131 = fsub float %40, undef
+ %132 = fmul float %129, 0.000000e+00
+ %133 = fmul float %130, undef
%134 = fmul float %131, undef
- %135 = fmul float %132, undef
- %136 = fsub float %41, undef
- %137 = fsub float %42, undef
- %138 = fsub float %43, undef
+ %135 = fsub float %41, undef
+ %136 = fsub float %42, undef
+ %137 = fsub float %43, undef
+ %138 = fmul float %135, undef
%139 = fmul float %136, undef
%140 = fmul float %137, undef
- %141 = fmul float %138, undef
+ %141 = call float @llvm.minnum.f32(float %132, float %138)
%142 = call float @llvm.minnum.f32(float %133, float %139)
%143 = call float @llvm.minnum.f32(float %134, float %140)
- %144 = call float @llvm.minnum.f32(float %135, float %141)
- %145 = call float @llvm.maxnum.f32(float %142, float %143)
- %146 = call float @llvm.maxnum.f32(float %145, float %144)
- %147 = fsub float %44, undef
- %148 = fsub float %45, undef
- %149 = fsub float %46, undef
+ %144 = call float @llvm.maxnum.f32(float %141, float %142)
+ %145 = call float @llvm.maxnum.f32(float %144, float %143)
+ %146 = fsub float %44, undef
+ %147 = fsub float %45, undef
+ %148 = fsub float %46, undef
+ %149 = fmul float %146, 0.000000e+00
%150 = fmul float %147, 0.000000e+00
- %151 = fmul float %148, 0.000000e+00
- %152 = fmul float %149, undef
- %153 = fsub float %47, undef
- %154 = fsub float %48, undef
- %155 = fsub float %49, undef
- %156 = fmul float %153, undef
- %157 = fmul float %154, 0.000000e+00
- %158 = fmul float %155, undef
+ %151 = fmul float %148, undef
+ %152 = fsub float %47, undef
+ %153 = fsub float %48, undef
+ %154 = fsub float %49, undef
+ %155 = fmul float %152, undef
+ %156 = fmul float %153, 0.000000e+00
+ %157 = fmul float %154, undef
+ %158 = call float @llvm.minnum.f32(float %149, float %155)
%159 = call float @llvm.minnum.f32(float %150, float %156)
%160 = call float @llvm.minnum.f32(float %151, float %157)
- %161 = call float @llvm.minnum.f32(float %152, float %158)
- %162 = call float @llvm.maxnum.f32(float %159, float %160)
- %163 = call float @llvm.maxnum.f32(float %162, float %161)
- %164 = fsub float %50, undef
- %165 = fsub float %51, undef
- %166 = fsub float %52, undef
- %167 = fmul float %164, undef
+ %161 = call float @llvm.maxnum.f32(float %158, float %159)
+ %162 = call float @llvm.maxnum.f32(float %161, float %160)
+ %163 = fsub float %50, undef
+ %164 = fsub float %51, undef
+ %165 = fsub float %52, undef
+ %166 = fmul float %163, undef
+ %167 = fmul float %164, 0.000000e+00
%168 = fmul float %165, 0.000000e+00
- %169 = fmul float %166, 0.000000e+00
- %170 = fsub float %53, undef
- %171 = fsub float %54, undef
- %172 = fsub float %55, undef
- %173 = fdiv float 1.000000e+00, %temp18.0
+ %169 = fsub float %53, undef
+ %170 = fsub float %54, undef
+ %171 = fsub float %55, undef
+ %172 = fdiv float 1.000000e+00, %temp18.0
+ %173 = fmul float %169, undef
%174 = fmul float %170, undef
- %175 = fmul float %171, undef
- %176 = fmul float %172, %173
+ %175 = fmul float %171, %172
+ %176 = call float @llvm.minnum.f32(float %166, float %173)
%177 = call float @llvm.minnum.f32(float %167, float %174)
%178 = call float @llvm.minnum.f32(float %168, float %175)
- %179 = call float @llvm.minnum.f32(float %169, float %176)
- %180 = call float @llvm.maxnum.f32(float %177, float %178)
- %181 = call float @llvm.maxnum.f32(float %180, float %179)
- %182 = fsub float %62, undef
- %183 = fsub float %63, undef
- %184 = fsub float %64, undef
- %185 = fmul float %182, 0.000000e+00
+ %179 = call float @llvm.maxnum.f32(float %176, float %177)
+ %180 = call float @llvm.maxnum.f32(float %179, float %178)
+ %181 = fsub float %62, undef
+ %182 = fsub float %63, undef
+ %183 = fsub float %64, undef
+ %184 = fmul float %181, 0.000000e+00
+ %185 = fmul float %182, undef
%186 = fmul float %183, undef
- %187 = fmul float %184, undef
- %188 = fsub float %65, undef
- %189 = fsub float %66, undef
+ %187 = fsub float %65, undef
+ %188 = fsub float %66, undef
+ %189 = fmul float %187, undef
%190 = fmul float %188, undef
- %191 = fmul float %189, undef
+ %191 = call float @llvm.maxnum.f32(float %184, float %189)
%192 = call float @llvm.maxnum.f32(float %185, float %190)
- %193 = call float @llvm.maxnum.f32(float %186, float %191)
- %194 = call float @llvm.maxnum.f32(float %187, float undef)
- %195 = call float @llvm.minnum.f32(float %192, float %193)
- %196 = call float @llvm.minnum.f32(float %195, float %194)
- %.temp292.7 = select i1 undef, float %163, float undef
- %temp292.9 = select i1 false, float %181, float %.temp292.7
+ %193 = call float @llvm.maxnum.f32(float %186, float undef)
+ %194 = call float @llvm.minnum.f32(float %191, float %192)
+ %195 = call float @llvm.minnum.f32(float %194, float %193)
+ %.temp292.7 = select i1 undef, float %162, float undef
+ %temp292.9 = select i1 false, float %180, float %.temp292.7
%.temp292.9 = select i1 undef, float undef, float %temp292.9
- %197 = fcmp ogt float undef, 0.000000e+00
- %198 = fcmp olt float undef, %196
- %199 = and i1 %197, %198
- %200 = fcmp olt float undef, %.temp292.9
- %201 = and i1 %199, %200
- %temp292.11 = select i1 %201, float undef, float %.temp292.9
- br i1 undef, label %IF2565, label %ELSE2566
+ %196 = fcmp ogt float undef, 0.000000e+00
+ %197 = fcmp olt float undef, %195
+ %198 = and i1 %196, %197
+ %199 = fcmp olt float undef, %.temp292.9
+ %200 = and i1 %198, %199
+ %temp292.11 = select i1 %200, float undef, float %.temp292.9
+ %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp0 = icmp eq i32 %tid0, 0
+ br i1 %cmp0, label %IF2565, label %ELSE2566
IF2565: ; preds = %ENDIF
- br i1 false, label %ENDIF2582, label %ELSE2584
+ %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp1 = icmp eq i32 %tid1, 0
+ br i1 %cmp1, label %ENDIF2582, label %ELSE2584
ELSE2566: ; preds = %ENDIF
- %202 = fcmp oeq float %temp292.11, 1.000000e+04
- br i1 %202, label %ENDLOOP, label %ELSE2593
+ %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tidf = bitcast i32 %tid2 to float
+ %201 = fcmp oeq float %temp292.11, %tidf
+ br i1 %201, label %ENDLOOP, label %ELSE2593
ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
%temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
- %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
- %203 = fsub float %5, undef
- %204 = fmul float %203, undef
- %205 = call float @llvm.maxnum.f32(float undef, float %204)
+ %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+ %202 = fsub float %5, undef
+ %203 = fmul float %202, undef
+ %204 = call float @llvm.maxnum.f32(float undef, float %203)
+ %205 = call float @llvm.minnum.f32(float %204, float undef)
%206 = call float @llvm.minnum.f32(float %205, float undef)
- %207 = call float @llvm.minnum.f32(float %206, float undef)
- %208 = fcmp ogt float undef, 0.000000e+00
- %209 = fcmp olt float undef, 1.000000e+00
- %210 = and i1 %208, %209
- %211 = fcmp olt float undef, %207
- %212 = and i1 %210, %211
- br i1 %212, label %ENDIF2795, label %ELSE2797
+ %207 = fcmp ogt float undef, 0.000000e+00
+ %208 = fcmp olt float undef, 1.000000e+00
+ %209 = and i1 %207, %208
+ %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tidf3 = bitcast i32 %tid3 to float
+ %210 = fcmp olt float %tidf3, %206
+ %211 = and i1 %209, %210
+ br i1 %211, label %ENDIF2795, label %ELSE2797
ELSE2584: ; preds = %IF2565
br label %ENDIF2582
ENDIF2582: ; preds = %ELSE2584, %IF2565
- %213 = fadd float %1, undef
- %214 = fadd float 0.000000e+00, %213
- %215 = call float @llvm.AMDIL.fraction.(float %214)
- br i1 undef, label %IF2589, label %ELSE2590
+ %212 = fadd float %1, undef
+ %213 = fadd float 0.000000e+00, %212
+ %floor = call float @llvm.floor.f32(float %213)
+ %214 = fsub float %213, %floor
+ %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp4 = icmp eq i32 %tid4, 0
+ br i1 %cmp4, label %IF2589, label %ELSE2590
IF2589: ; preds = %ENDIF2582
br label %ENDIF2588
@@ -266,61 +280,61 @@ ELSE2590: ; preds = %ENDIF2582
br label %ENDIF2588
ENDIF2588: ; preds = %ELSE2590, %IF2589
- %216 = fsub float 1.000000e+00, %215
- %217 = call float @llvm.sqrt.f32(float %216)
- %218 = fmul float %217, undef
- %219 = fadd float %218, undef
+ %215 = fsub float 1.000000e+00, %214
+ %216 = call float @llvm.sqrt.f32(float %215)
+ %217 = fmul float %216, undef
+ %218 = fadd float %217, undef
br label %ENDIF2564
ELSE2593: ; preds = %ELSE2566
- %220 = fcmp oeq float %temp292.11, %82
- %221 = fcmp olt float %82, %84
- %222 = and i1 %220, %221
- br i1 %222, label %ENDIF2594, label %ELSE2596
+ %219 = fcmp oeq float %temp292.11, %81
+ %220 = fcmp olt float %81, %83
+ %221 = and i1 %219, %220
+ br i1 %221, label %ENDIF2594, label %ELSE2596
ELSE2596: ; preds = %ELSE2593
- %223 = fcmp oeq float %temp292.11, %101
- %224 = fcmp olt float %101, %103
- %225 = and i1 %223, %224
- br i1 %225, label %ENDIF2594, label %ELSE2632
+ %222 = fcmp oeq float %temp292.11, %100
+ %223 = fcmp olt float %100, %102
+ %224 = and i1 %222, %223
+ br i1 %224, label %ENDIF2594, label %ELSE2632
ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
%temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
- %226 = fmul float %temp894.2, undef
+ %225 = fmul float %temp894.2, undef
br label %ENDIF2564
ELSE2632: ; preds = %ELSE2596
br i1 undef, label %ENDIF2594, label %ELSE2650
ELSE2650: ; preds = %ELSE2632
- %227 = fcmp oeq float %temp292.11, %111
- %228 = fcmp olt float %111, %112
- %229 = and i1 %227, %228
- br i1 %229, label %IF2667, label %ELSE2668
+ %226 = fcmp oeq float %temp292.11, %110
+ %227 = fcmp olt float %110, %111
+ %228 = and i1 %226, %227
+ br i1 %228, label %IF2667, label %ELSE2668
IF2667: ; preds = %ELSE2650
br i1 undef, label %ENDIF2594, label %ELSE2671
ELSE2668: ; preds = %ELSE2650
- %230 = fcmp oeq float %temp292.11, %129
- %231 = fcmp olt float %129, undef
- %232 = and i1 %230, %231
- br i1 %232, label %ENDIF2594, label %ELSE2686
+ %229 = fcmp oeq float %temp292.11, %128
+ %230 = fcmp olt float %128, undef
+ %231 = and i1 %229, %230
+ br i1 %231, label %ENDIF2594, label %ELSE2686
ELSE2671: ; preds = %IF2667
br label %ENDIF2594
ELSE2686: ; preds = %ELSE2668
- %233 = fcmp oeq float %temp292.11, %146
- %234 = fcmp olt float %146, undef
- %235 = and i1 %233, %234
- br i1 %235, label %ENDIF2594, label %ELSE2704
+ %232 = fcmp oeq float %temp292.11, %145
+ %233 = fcmp olt float %145, undef
+ %234 = and i1 %232, %233
+ br i1 %234, label %ENDIF2594, label %ELSE2704
ELSE2704: ; preds = %ELSE2686
- %236 = fcmp oeq float %temp292.11, %181
- %237 = fcmp olt float %181, undef
- %238 = and i1 %236, %237
- br i1 %238, label %ENDIF2594, label %ELSE2740
+ %235 = fcmp oeq float %temp292.11, %180
+ %236 = fcmp olt float %180, undef
+ %237 = and i1 %235, %236
+ br i1 %237, label %ENDIF2594, label %ELSE2740
ELSE2740: ; preds = %ELSE2704
br i1 undef, label %IF2757, label %ELSE2758
@@ -335,8 +349,8 @@ ELSE2761: ; preds = %IF2757
br label %ENDIF2594
IF2775: ; preds = %ELSE2758
- %239 = fcmp olt float undef, undef
- br i1 %239, label %ENDIF2594, label %ELSE2779
+ %238 = fcmp olt float undef, undef
+ br i1 %238, label %ENDIF2594, label %ELSE2779
ELSE2779: ; preds = %IF2775
br i1 undef, label %ENDIF2594, label %ELSE2782
@@ -345,39 +359,39 @@ ELSE2782: ; preds = %ELSE2779
br i1 undef, label %ENDIF2594, label %ELSE2785
ELSE2785: ; preds = %ELSE2782
- %240 = fcmp olt float undef, 0.000000e+00
- br i1 %240, label %ENDIF2594, label %ELSE2788
+ %239 = fcmp olt float undef, 0.000000e+00
+ br i1 %239, label %ENDIF2594, label %ELSE2788
ELSE2788: ; preds = %ELSE2785
- %241 = fcmp olt float 0.000000e+00, undef
- %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
+ %240 = fcmp olt float 0.000000e+00, undef
+ %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00
br label %ENDIF2594
ELSE2797: ; preds = %ENDIF2564
- %242 = fsub float %8, undef
- %243 = fsub float %9, undef
- %244 = fsub float %10, undef
+ %241 = fsub float %8, undef
+ %242 = fsub float %9, undef
+ %243 = fsub float %10, undef
+ %244 = fmul float %241, undef
%245 = fmul float %242, undef
%246 = fmul float %243, undef
- %247 = fmul float %244, undef
- %248 = fsub float %11, undef
- %249 = fsub float %12, undef
- %250 = fsub float %13, undef
+ %247 = fsub float %11, undef
+ %248 = fsub float %12, undef
+ %249 = fsub float %13, undef
+ %250 = fmul float %247, undef
%251 = fmul float %248, undef
%252 = fmul float %249, undef
- %253 = fmul float %250, undef
+ %253 = call float @llvm.minnum.f32(float %244, float %250)
%254 = call float @llvm.minnum.f32(float %245, float %251)
- %255 = call float @llvm.minnum.f32(float %246, float %252)
- %256 = call float @llvm.maxnum.f32(float %247, float %253)
- %257 = call float @llvm.maxnum.f32(float %254, float %255)
- %258 = call float @llvm.maxnum.f32(float %257, float undef)
- %259 = call float @llvm.minnum.f32(float undef, float %256)
- %260 = fcmp ogt float %258, 0.000000e+00
- %261 = fcmp olt float %258, 1.000000e+00
- %262 = and i1 %260, %261
- %263 = fcmp olt float %258, %259
- %264 = and i1 %262, %263
- br i1 %264, label %ENDIF2795, label %ELSE2800
+ %255 = call float @llvm.maxnum.f32(float %246, float %252)
+ %256 = call float @llvm.maxnum.f32(float %253, float %254)
+ %257 = call float @llvm.maxnum.f32(float %256, float undef)
+ %258 = call float @llvm.minnum.f32(float undef, float %255)
+ %259 = fcmp ogt float %257, 0.000000e+00
+ %260 = fcmp olt float %257, 1.000000e+00
+ %261 = and i1 %259, %260
+ %262 = fcmp olt float %257, %258
+ %263 = and i1 %261, %262
+ br i1 %263, label %ENDIF2795, label %ELSE2800
ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
br label %LOOP
@@ -386,53 +400,53 @@ ELSE2800: ; preds = %ELSE2797
br i1 undef, label %ENDIF2795, label %ELSE2803
ELSE2803: ; preds = %ELSE2800
- %265 = fsub float %20, undef
- %266 = fsub float %21, undef
- %267 = fsub float %22, undef
+ %264 = fsub float %20, undef
+ %265 = fsub float %21, undef
+ %266 = fsub float %22, undef
+ %267 = fmul float %264, undef
%268 = fmul float %265, undef
- %269 = fmul float %266, undef
- %270 = fmul float %267, 0.000000e+00
- %271 = fsub float %23, undef
- %272 = fsub float %24, undef
- %273 = fsub float %25, undef
+ %269 = fmul float %266, 0.000000e+00
+ %270 = fsub float %23, undef
+ %271 = fsub float %24, undef
+ %272 = fsub float %25, undef
+ %273 = fmul float %270, undef
%274 = fmul float %271, undef
%275 = fmul float %272, undef
- %276 = fmul float %273, undef
- %277 = call float @llvm.minnum.f32(float %268, float %274)
+ %276 = call float @llvm.minnum.f32(float %267, float %273)
+ %277 = call float @llvm.maxnum.f32(float %268, float %274)
%278 = call float @llvm.maxnum.f32(float %269, float %275)
- %279 = call float @llvm.maxnum.f32(float %270, float %276)
- %280 = call float @llvm.maxnum.f32(float %277, float undef)
- %281 = call float @llvm.maxnum.f32(float %280, float undef)
- %282 = call float @llvm.minnum.f32(float undef, float %278)
- %283 = call float @llvm.minnum.f32(float %282, float %279)
- %284 = fcmp ogt float %281, 0.000000e+00
- %285 = fcmp olt float %281, 1.000000e+00
- %286 = and i1 %284, %285
- %287 = fcmp olt float %281, %283
- %288 = and i1 %286, %287
- br i1 %288, label %ENDIF2795, label %ELSE2806
+ %279 = call float @llvm.maxnum.f32(float %276, float undef)
+ %280 = call float @llvm.maxnum.f32(float %279, float undef)
+ %281 = call float @llvm.minnum.f32(float undef, float %277)
+ %282 = call float @llvm.minnum.f32(float %281, float %278)
+ %283 = fcmp ogt float %280, 0.000000e+00
+ %284 = fcmp olt float %280, 1.000000e+00
+ %285 = and i1 %283, %284
+ %286 = fcmp olt float %280, %282
+ %287 = and i1 %285, %286
+ br i1 %287, label %ENDIF2795, label %ELSE2806
ELSE2806: ; preds = %ELSE2803
- %289 = fsub float %26, undef
- %290 = fsub float %27, undef
- %291 = fsub float %28, undef
- %292 = fmul float %289, undef
- %293 = fmul float %290, 0.000000e+00
- %294 = fmul float %291, undef
- %295 = fsub float %29, undef
- %296 = fmul float %295, undef
- %297 = call float @llvm.minnum.f32(float %292, float %296)
- %298 = call float @llvm.minnum.f32(float %293, float undef)
- %299 = call float @llvm.maxnum.f32(float %294, float undef)
- %300 = call float @llvm.maxnum.f32(float %297, float %298)
- %301 = call float @llvm.maxnum.f32(float %300, float undef)
- %302 = call float @llvm.minnum.f32(float undef, float %299)
- %303 = fcmp ogt float %301, 0.000000e+00
- %304 = fcmp olt float %301, 1.000000e+00
- %305 = and i1 %303, %304
- %306 = fcmp olt float %301, %302
- %307 = and i1 %305, %306
- br i1 %307, label %ENDIF2795, label %ELSE2809
+ %288 = fsub float %26, undef
+ %289 = fsub float %27, undef
+ %290 = fsub float %28, undef
+ %291 = fmul float %288, undef
+ %292 = fmul float %289, 0.000000e+00
+ %293 = fmul float %290, undef
+ %294 = fsub float %29, undef
+ %295 = fmul float %294, undef
+ %296 = call float @llvm.minnum.f32(float %291, float %295)
+ %297 = call float @llvm.minnum.f32(float %292, float undef)
+ %298 = call float @llvm.maxnum.f32(float %293, float undef)
+ %299 = call float @llvm.maxnum.f32(float %296, float %297)
+ %300 = call float @llvm.maxnum.f32(float %299, float undef)
+ %301 = call float @llvm.minnum.f32(float undef, float %298)
+ %302 = fcmp ogt float %300, 0.000000e+00
+ %303 = fcmp olt float %300, 1.000000e+00
+ %304 = and i1 %302, %303
+ %305 = fcmp olt float %300, %301
+ %306 = and i1 %304, %305
+ br i1 %306, label %ENDIF2795, label %ELSE2809
ELSE2809: ; preds = %ELSE2806
br i1 undef, label %ENDIF2795, label %ELSE2812
@@ -447,40 +461,42 @@ ELSE2818: ; preds = %ELSE2815
br i1 undef, label %ENDIF2795, label %ELSE2821
ELSE2821: ; preds = %ELSE2818
- %308 = fsub float %56, undef
- %309 = fsub float %57, undef
- %310 = fsub float %58, undef
- %311 = fmul float %308, undef
- %312 = fmul float %309, 0.000000e+00
- %313 = fmul float %310, undef
- %314 = fsub float %59, undef
- %315 = fsub float %60, undef
- %316 = fsub float %61, undef
+ %307 = fsub float %56, undef
+ %308 = fsub float %57, undef
+ %309 = fsub float %58, undef
+ %310 = fmul float %307, undef
+ %311 = fmul float %308, 0.000000e+00
+ %312 = fmul float %309, undef
+ %313 = fsub float %59, undef
+ %314 = fsub float %60, undef
+ %315 = fsub float %61, undef
+ %316 = fmul float %313, undef
%317 = fmul float %314, undef
%318 = fmul float %315, undef
- %319 = fmul float %316, undef
+ %319 = call float @llvm.maxnum.f32(float %310, float %316)
%320 = call float @llvm.maxnum.f32(float %311, float %317)
%321 = call float @llvm.maxnum.f32(float %312, float %318)
- %322 = call float @llvm.maxnum.f32(float %313, float %319)
- %323 = call float @llvm.minnum.f32(float %320, float %321)
- %324 = call float @llvm.minnum.f32(float %323, float %322)
- %325 = fcmp ogt float undef, 0.000000e+00
- %326 = fcmp olt float undef, 1.000000e+00
- %327 = and i1 %325, %326
- %328 = fcmp olt float undef, %324
- %329 = and i1 %327, %328
- br i1 %329, label %ENDIF2795, label %ELSE2824
+ %322 = call float @llvm.minnum.f32(float %319, float %320)
+ %323 = call float @llvm.minnum.f32(float %322, float %321)
+ %324 = fcmp ogt float undef, 0.000000e+00
+ %325 = fcmp olt float undef, 1.000000e+00
+ %326 = and i1 %324, %325
+ %327 = fcmp olt float undef, %323
+ %328 = and i1 %326, %327
+ br i1 %328, label %ENDIF2795, label %ELSE2824
ELSE2824: ; preds = %ELSE2821
%.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
br label %ENDIF2795
}
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-; Function Attrs: readnone
-declare float @llvm.AMDIL.fraction.(float) #2
+; Function Attrs: nounwind readnone
+declare float @llvm.floor.f32(float) #1
; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #1
@@ -491,11 +507,7 @@ declare float @llvm.minnum.f32(float, float) #1
; Function Attrs: nounwind readnone
declare float @llvm.maxnum.f32(float, float) #1
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
new file mode 100644
index 000000000000..5171406469ab
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s
+
+; Make sure this doesn't crash.
+; CHECK: {{^}}test:
+; Make sure we are handling hazards correctly.
+; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
+; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
+; CHECK: s_endpgm
+define void @test(i32 addrspace(1)* %out, i32 %in) {
+ call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
+ call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
+ call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
+ call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" ()
+ call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" ()
+ call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" ()
+ call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" ()
+ call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" ()
+ call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" ()
+ call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" ()
+ call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" ()
+ call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" ()
+ call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" ()
+ call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" ()
+ call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19_VGPR20_VGPR21_VGPR22_VGPR23}" ()
+ call void asm sideeffect "", "~{VGPR24_VGPR25_VGPR26_VGPR27_VGPR28_VGPR29_VGPR30_VGPR31}" ()
+ call void asm sideeffect "", "~{VGPR32_VGPR33_VGPR34_VGPR35_VGPR36_VGPR37_VGPR38_VGPR39}" ()
+ call void asm sideeffect "", "~{VGPR40_VGPR41_VGPR42_VGPR43_VGPR44_VGPR45_VGPR46_VGPR47}" ()
+ call void asm sideeffect "", "~{VGPR48_VGPR49_VGPR50_VGPR51_VGPR52_VGPR53_VGPR54_VGPR55}" ()
+ call void asm sideeffect "", "~{VGPR56_VGPR57_VGPR58_VGPR59_VGPR60_VGPR61_VGPR62_VGPR63}" ()
+ call void asm sideeffect "", "~{VGPR64_VGPR65_VGPR66_VGPR67_VGPR68_VGPR69_VGPR70_VGPR71}" ()
+ call void asm sideeffect "", "~{VGPR72_VGPR73_VGPR74_VGPR75_VGPR76_VGPR77_VGPR78_VGPR79}" ()
+ call void asm sideeffect "", "~{VGPR80_VGPR81_VGPR82_VGPR83_VGPR84_VGPR85_VGPR86_VGPR87}" ()
+ call void asm sideeffect "", "~{VGPR88_VGPR89_VGPR90_VGPR91_VGPR92_VGPR93_VGPR94_VGPR95}" ()
+ call void asm sideeffect "", "~{VGPR96_VGPR97_VGPR98_VGPR99_VGPR100_VGPR101_VGPR102_VGPR103}" ()
+ call void asm sideeffect "", "~{VGPR104_VGPR105_VGPR106_VGPR107_VGPR108_VGPR109_VGPR110_VGPR111}" ()
+ call void asm sideeffect "", "~{VGPR112_VGPR113_VGPR114_VGPR115_VGPR116_VGPR117_VGPR118_VGPR119}" ()
+ call void asm sideeffect "", "~{VGPR120_VGPR121_VGPR122_VGPR123_VGPR124_VGPR125_VGPR126_VGPR127}" ()
+ call void asm sideeffect "", "~{VGPR128_VGPR129_VGPR130_VGPR131_VGPR132_VGPR133_VGPR134_VGPR135}" ()
+ call void asm sideeffect "", "~{VGPR136_VGPR137_VGPR138_VGPR139_VGPR140_VGPR141_VGPR142_VGPR143}" ()
+ call void asm sideeffect "", "~{VGPR144_VGPR145_VGPR146_VGPR147_VGPR148_VGPR149_VGPR150_VGPR151}" ()
+ call void asm sideeffect "", "~{VGPR152_VGPR153_VGPR154_VGPR155_VGPR156_VGPR157_VGPR158_VGPR159}" ()
+ call void asm sideeffect "", "~{VGPR160_VGPR161_VGPR162_VGPR163_VGPR164_VGPR165_VGPR166_VGPR167}" ()
+ call void asm sideeffect "", "~{VGPR168_VGPR169_VGPR170_VGPR171_VGPR172_VGPR173_VGPR174_VGPR175}" ()
+ call void asm sideeffect "", "~{VGPR176_VGPR177_VGPR178_VGPR179_VGPR180_VGPR181_VGPR182_VGPR183}" ()
+ call void asm sideeffect "", "~{VGPR184_VGPR185_VGPR186_VGPR187_VGPR188_VGPR189_VGPR190_VGPR191}" ()
+ call void asm sideeffect "", "~{VGPR192_VGPR193_VGPR194_VGPR195_VGPR196_VGPR197_VGPR198_VGPR199}" ()
+ call void asm sideeffect "", "~{VGPR200_VGPR201_VGPR202_VGPR203_VGPR204_VGPR205_VGPR206_VGPR207}" ()
+ call void asm sideeffect "", "~{VGPR208_VGPR209_VGPR210_VGPR211_VGPR212_VGPR213_VGPR214_VGPR215}" ()
+ call void asm sideeffect "", "~{VGPR216_VGPR217_VGPR218_VGPR219_VGPR220_VGPR221_VGPR222_VGPR223}" ()
+ call void asm sideeffect "", "~{VGPR224_VGPR225_VGPR226_VGPR227_VGPR228_VGPR229_VGPR230_VGPR231}" ()
+ call void asm sideeffect "", "~{VGPR232_VGPR233_VGPR234_VGPR235_VGPR236_VGPR237_VGPR238_VGPR239}" ()
+ call void asm sideeffect "", "~{VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247}" ()
+ call void asm sideeffect "", "~{VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255}" ()
+
+ store i32 %in, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index bc766dbcac67..0e9618523e32 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -2,7 +2,7 @@
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #1
@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
@@ -10,14 +10,13 @@ declare void @llvm.AMDGPU.barrier.local() #2
@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
; FUNC-LABEL: @reorder_local_load_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
; CI: buffer_store_dword
define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
@@ -32,12 +31,12 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out,
; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
; CI: buffer_store_dword
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store volatile i32 99, i32 addrspace(1)* %gptr, align 4
@@ -51,17 +50,17 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
; CI: buffer_store_dword
define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
- call void @llvm.AMDGPU.barrier.local() #2
+ call void @llvm.amdgcn.s.barrier() #1
%tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
%add = add nsw i32 %tmp1, %tmp2
@@ -70,19 +69,18 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
ret void
}
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-; CI: buffer_load_dword
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
+; CI-DAG: buffer_store_dword
+; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
@@ -95,15 +93,17 @@ define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1
}
; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: ds_write_b32
; CI: buffer_store_dword
define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
store i32 99, i32 addrspace(3)* %lptr, align 4
@@ -142,7 +142,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
; CI: buffer_store_dword
define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
store i32 99, i32 addrspace(3)* %lptr, align 4
@@ -155,17 +155,15 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
}
; FUNC-LABEL: @reorder_local_offsets
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
; CI: buffer_store_dword
; CI: s_endpgm
define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
- %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
store i32 123, i32 addrspace(3)* %ptr1, align 4
%tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
@@ -181,18 +179,17 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
}
; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
; CI: s_endpgm
define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
- %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
store i32 123, i32 addrspace(1)* %ptr1, align 4
%tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
@@ -211,7 +208,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
; XCI: TBUFFER_STORE_FORMAT
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
-; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -232,6 +229,5 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
; ret void
; }
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind convergent }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
index bd427dd3ed46..c7d85a0340cc 100644
--- a/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -90,7 +90,7 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 06bee114c23a..30e6bd1e78f2 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; SI-LABEL: {{^}}s_sext_i1_to_i32:
-; SI: v_cndmask_b32_e64
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i1_to_i32:
+; GCN: v_cndmask_b32_e64
+; GCN: s_endpgm
define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i32
@@ -11,9 +11,9 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
ret void
}
-; SI-LABEL: {{^}}test_s_sext_i32_to_i64:
-; SI: s_ashr_i32
-; SI: s_endpg
+; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
+; GCN: s_ashr_i32
+; GCN: s_endpgm
define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
%mul = mul i32 %a, %b
@@ -23,11 +23,11 @@ entry:
ret void
}
-; SI-LABEL: {{^}}s_sext_i1_to_i64:
-; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
-; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i1_to_i64:
+; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
+; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
+; GCN: s_endpgm
define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i64
@@ -35,18 +35,18 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
ret void
}
-; SI-LABEL: {{^}}s_sext_i32_to_i64:
-; SI: s_ashr_i32
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i32_to_i64:
+; GCN: s_ashr_i32
+; GCN: s_endpgm
define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
%sext = sext i32 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}v_sext_i32_to_i64:
-; SI: v_ashr
-; SI: s_endpgm
+; GCN-LABEL: {{^}}v_sext_i32_to_i64:
+; GCN: v_ashr
+; GCN: s_endpgm
define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%sext = sext i32 %val to i64
@@ -54,10 +54,112 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
ret void
}
-; SI-LABEL: {{^}}s_sext_i16_to_i64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i16_to_i64:
+; GCN: s_endpgm
define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
%sext = sext i16 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
+
+; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
+; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
+; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
+; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
+
+; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
+
+; GCN-DAG: buffer_store_dword [[VEXT0]]
+; GCN-DAG: buffer_store_dword [[VEXT1]]
+; GCN-DAG: buffer_store_dword [[VEXT2]]
+; GCN-DAG: buffer_store_dword [[VEXT3]]
+
+; GCN: s_endpgm
+define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cast = bitcast i32 %a to <4 x i8>
+ %ext = sext <4 x i8> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
+; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
+; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
+; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
+
+; GCN: buffer_store_dword [[EXT0]]
+; GCN: buffer_store_dword [[EXT1]]
+; GCN: buffer_store_dword [[EXT2]]
+; GCN: buffer_store_dword [[EXT3]]
+define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32, i32 addrspace(1)* %in
+ %cast = bitcast i32 %a to <4 x i8>
+ %ext = sext <4 x i8> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: s_bfe_i64
+; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
+; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
+; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: s_sext_i32_i16
+; GCN: s_endpgm
+define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+ %cast = bitcast i64 %a to <4 x i16>
+ %ext = sext <4 x i16> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
+; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48
+; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: s_endpgm
+define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %a = load i64, i64 addrspace(1)* %in
+ %cast = bitcast i64 %a to <4 x i16>
+ %ext = sext <4 x i16> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index a94ccc32e61c..7c58f2d906d4 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
; SI: v_cvt_f64_i32_e32
@@ -10,14 +10,14 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
ret void
}
-; FIXME: select on 0, 0
-; SI-LABEL: {{^}}sint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 vcc,
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc,
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
@@ -52,7 +52,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
%result = sitofp i64 %val to double
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 138b93b16d8d..16eae1899ec0 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
@@ -13,8 +13,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2
-; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63
-; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_xor_b32
; GCN: v_ffbh_u32
@@ -26,10 +25,10 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN-DAG: v_cmp_lt_u64
; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]],
-; GCN: {{buffer|flat}}_store_dword [[SIGN_SEL]]
+; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
+; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
@@ -47,7 +46,7 @@ define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #
; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64:
define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
%value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
@@ -56,7 +55,7 @@ define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrsp
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index 851085c9535d..75ffdd2cc85a 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -103,7 +103,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: v_cmp_eq_i32
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
-; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
new file mode 100644
index 000000000000..10187f6125d6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -0,0 +1,390 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
+ call void @llvm.AMDGPU.kill(float 0.0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
+ call void @llvm.AMDGPU.kill(float -0.0)
+ ret void
+}
+
+; FIXME: Ideally only one would be emitted
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
+ call void @llvm.AMDGPU.kill(float -0.0)
+ call void @llvm.AMDGPU.kill(float -1.0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ ret void
+}
+
+; FIXME: Ideally only one would be emitted
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ call void @llvm.AMDGPU.kill(float %x)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ call void @llvm.AMDGPU.kill(float %y)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %y)
+ ret void
+}
+
+; FIXME: why does the skip depend on the asm length in the same block?
+
+; CHECK-LABEL: {{^}}test_kill_control_flow:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; BB#3:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ br label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1: ; %bb
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: ;;#ASMEND
+; CHECK: v_mov_b32_e64 v8, -1
+; CHECK: ;;#ASMEND
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#4:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK: buffer_store_dword v8
+; CHECK: v_mov_b32_e64 v9, -2
+
+; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
+; CHECK: buffer_store_dword v9
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ store volatile float %live.across, float addrspace(1)* undef
+ %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"()
+ br label %exit
+
+exit:
+ %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
+ store float %phi, float addrspace(1)* undef
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_divergent_loop:
+; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; mask branch [[EXIT]]
+
+; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
+
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+
+; CHECK-NEXT: ; BB#3:
+; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
+; CHECK: v_cmp_eq_i32_e32 vcc, 0, [[LOAD]]
+; CHECK-NEXT: s_and_b64 vcc, exec, vcc
+; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
+
+; CHECK-NEXT: {{^}}[[EXIT]]:
+; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ %vgpr = load volatile i32, i32 addrspace(1)* undef
+ %loop.cond = icmp eq i32 %vgpr, 0
+ br i1 %loop.cond, label %bb, label %exit
+
+exit:
+ store volatile i32 8, i32 addrspace(1)* undef
+ ret void
+}
+
+; bug 28550
+; CHECK-LABEL: {{^}}phi_use_def_before_kill:
+; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
+; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: exp
+; CHECK-NEXT: s_endpgm
+
+; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
+; CHECK: s_and_b64 vcc, exec,
+; CHECK-NEXT: s_cbranch_vccz [[PHIBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[PHIBB]]:
+; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]]
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: ; BB#3: ; %bb10
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
+; CHECK: buffer_store_dword
+
+; CHECK: [[ENDBB]]:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @phi_use_def_before_kill() #0 {
+bb:
+ %tmp = fadd float undef, 1.000000e+00
+ %tmp1 = fcmp olt float 0.000000e+00, %tmp
+ %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
+ call void @llvm.AMDGPU.kill(float %tmp2)
+ br i1 undef, label %phibb, label %bb8
+
+phibb:
+ %tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ]
+ %tmp6 = fcmp oeq float %tmp5, 0.000000e+00
+ br i1 %tmp6, label %bb10, label %end
+
+bb8:
+ store volatile i32 8, i32 addrspace(1)* undef
+ br label %phibb
+
+bb10:
+ store volatile i32 9, i32 addrspace(1)* undef
+ br label %end
+
+end:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}no_skip_no_successors:
+; CHECK: v_cmp_nle_f32
+; CHECK: s_and_b64 vcc, exec,
+; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
+
+; CHECK: ; BB#3: ; %bb6
+; CHECK: s_mov_b64 exec, 0
+
+; CHECK: [[SKIPKILL]]:
+; CHECK: v_cmp_nge_f32
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[UNREACHABLE]]:
+; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
+define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
+bb:
+ %tmp = fcmp ult float %arg1, 0.000000e+00
+ %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
+ br i1 %tmp, label %bb6, label %bb3
+
+bb3: ; preds = %bb
+ br i1 %tmp2, label %bb5, label %bb4
+
+bb4: ; preds = %bb3
+ br i1 true, label %bb5, label %bb7
+
+bb5: ; preds = %bb4, %bb3
+ unreachable
+
+bb6: ; preds = %bb
+ call void @llvm.AMDGPU.kill(float -1.000000e+00)
+ unreachable
+
+bb7: ; preds = %bb4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}if_after_kill_block:
+; CHECK: ; BB#0:
+; CHECK: s_and_saveexec_b64
+; CHECK: s_xor_b64
+; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK: [[BB4]]:
+; CHECK: s_or_b64 exec, exec
+; CHECK: image_sample_c
+
+; CHECK: v_cmp_neq_f32_e32 vcc, 0,
+; CHECK: s_and_b64 exec, exec,
+; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
+; CHECK-NOT: branch
+
+; CHECK: ; BB#3: ; %bb8
+; CHECK: buffer_store_dword
+
+; CHECK: [[END]]:
+; CHECK: s_or_b64 exec, exec
+; CHECK: s_endpgm
+define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 {
+bb:
+ %tmp = fcmp ult float %arg1, 0.000000e+00
+ br i1 %tmp, label %bb3, label %bb4
+
+bb3: ; preds = %bb
+ call void @llvm.AMDGPU.kill(float %arg)
+ br label %bb4
+
+bb4: ; preds = %bb3, %bb
+ %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp6 = extractelement <4 x float> %tmp5, i32 0
+ %tmp7 = fcmp une float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb8, label %bb9
+
+bb8: ; preds = %bb9, %bb4
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+bb9: ; preds = %bb4
+ ret void
+}
+
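+; Note: llvm.AMDGPU.kill(%x) disables (kills) any lane where %x is negative;
+; for a non-constant operand this shows up above as v_cmpx_le_f32 vcc, 0, <x>,
+; which clears the exec bits of the failing lanes.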
+declare void @llvm.AMDGPU.kill(float) #0
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
new file mode 100644
index 000000000000..9b977fc54630
--- /dev/null
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -0,0 +1,449 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
+; GCN: v_max_i32
+; GCN: v_min_i32
+define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store volatile i32 %i0, i32 addrspace(1)* %outgep
+ store volatile i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 17
+ %i0 = select i1 %icmp0, i32 %a, i32 17
+
+ %icmp1 = icmp slt i32 %i0, 12
+ %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
+; GCN: v_cmp_lt_i64
+; GCN: v_cmp_gt_i64
+define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i64 %a, 12
+ %i0 = select i1 %icmp0, i64 %a, i64 12
+
+ %icmp1 = icmp slt i64 %i0, 17
+ %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+ store i64 %i1, i64 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i16 %a, 12
+ %i0 = select i1 %icmp0, i16 %a, i16 12
+
+ %icmp1 = icmp slt i16 %i0, 17
+ %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+
+define internal i32 @smin(i32 %x, i32 %y) #2 {
+ %cmp = icmp slt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i32 @smax(i32 %x, i32 %y) #2 {
+ %cmp = icmp sgt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i16 @smin16(i16 %x, i16 %y) #2 {
+ %cmp = icmp slt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i16 @smax16(i16 %x, i16 %y) #2 {
+ %cmp = icmp sgt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i8 @smin8(i8 %x, i8 %y) #2 {
+ %cmp = icmp slt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+define internal i8 @smax8(i8 %x, i8 %y) #2 {
+ %cmp = icmp sgt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
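+; The min/max helpers above are marked alwaysinline, so after inlining each
+; pattern below reduces to a plain smin/smax chain that can be matched to
+; v_med3_i32.
+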
+; 16 combinations
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; + commute outermost max
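+; (patterns 8-15 below repeat patterns 0-7 with the operands of the outermost
+; max swapped, i.e. smax(%tmp2, %tmp0) instead of smax(%tmp0, %tmp2))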
+
+
+; FIXME: In these cases we probably should have used scalar operations
+; instead.
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
+; GCN: s_sext_i32_i16
+; GCN: s_sext_i32_i16
+; GCN: s_sext_i32_i16
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+bb:
+ %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i8_pat_0:
+; GCN: s_sext_i32_i8
+; GCN: s_sext_i32_i8
+; GCN: s_sext_i32_i8
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+bb:
+ %tmp0 = call i8 @smin8(i8 %x, i8 %y)
+ %tmp1 = call i8 @smax8(i8 %x, i8 %y)
+ %tmp2 = call i8 @smin8(i8 %tmp1, i8 %z)
+ %tmp3 = call i8 @smax8(i8 %tmp0, i8 %tmp2)
+ store i8 %tmp3, i8 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp0, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp1, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp2, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index e646605f7da1..560d5597baa9 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,9 +1,12 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
%neg = sub i32 0, %val
%cond = icmp sgt i32 %val, %neg
@@ -17,6 +20,8 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
; GCN: v_add_i32
+
+; EG: MAX_INT
define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
%neg = sub i32 0, %val
@@ -32,6 +37,9 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
; GCN: s_abs_i32
; GCN: s_add_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
@@ -46,14 +54,17 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
}
; FUNC-LABEL: {{^}}v_abs_v2i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
; GCN: v_add_i32
; GCN: v_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
@@ -79,6 +90,11 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
@@ -97,20 +113,25 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
}
; FUNC-LABEL: {{^}}v_abs_v4i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: v_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
@@ -128,3 +149,76 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: {{^}}s_min_max_i32:
+; GCN: s_load_dword [[VAL0:s[0-9]+]]
+; GCN: s_load_dword [[VAL1:s[0-9]+]]
+
+; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
+; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
+define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_min_max_i32:
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]]
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
+
+; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+ %val0 = load volatile i32, i32 addrspace(1)* %ptr0
+ %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_min_max_v4i32:
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+ %cond0 = icmp sgt <4 x i32> %val0, %val1
+ %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
+ %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
+
+ store volatile <4 x i32> %sel0, <4 x i32> addrspace(1)* %out0, align 4
+ store volatile <4 x i32> %sel1, <4 x i32> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_min_max_i32_user:
+; GCN: v_cmp_gt_i32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
+define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+ %val0 = load volatile i32, i32 addrspace(1)* %ptr0
+ %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ store volatile i1 %cond0, i1 addrspace(1)* undef
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
new file mode 100644
index 000000000000..ddac8a006c86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s
+
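+; vcc is written by a VALU compare while the SMRD result is still outstanding,
+; so on verde and bonaire the checks expect an extra s_mov_b64 vcc, vcc after
+; the s_waitcnt to refresh vccz before the s_cbranch_vccnz; tonga needs no
+; workaround.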
+; GCN-LABEL: {{^}}vccz_workaround:
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
+; GCN: v_cmp_neq_f32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_and_b64 vcc, exec, [[MASK]]
+; GCN: s_waitcnt lgkmcnt(0)
+; VCCZ-BUG: s_mov_b64 vcc, vcc
+; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
+; GCN: buffer_store_dword
+; GCN: [[EXIT]]:
+; GCN: s_endpgm
+define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+entry:
+ %cnd = fcmp oeq float 0.0, %cond
+ %sgpr = load volatile i32, i32 addrspace(2)* %in
+ br i1 %cnd, label %if, label %endif
+
+if:
+ store i32 %sgpr, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; GCN-LABEL: {{^}}vccz_noworkaround:
+; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
+; GCN: buffer_store_dword
+; GCN: [[EXIT]]:
+; GCN: s_endpgm
+define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %vgpr = load volatile float, float addrspace(1)* %in
+ %cnd = fcmp oeq float 0.0, %vgpr
+ br i1 %cnd, label %if, label %endif
+
+if:
+ store float %vgpr, float addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 1d6bb9ece8c6..476da9486dff 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -88,7 +88,7 @@ entry:
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -102,7 +102,7 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -118,7 +118,7 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -133,7 +133,7 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -148,7 +148,7 @@ main_body:
; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
-define void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -158,9 +158,8 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index c91a44cf60e5..cc4b6bcbfb51 100644
--- a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -6,7 +6,8 @@
; TONGA-LABEL: test
define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
entry:
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%aptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
%a = load <256 x i32>, <256 x i32> addrspace(1)* %aptr
call void asm sideeffect "", "~{memory}" ()
@@ -21,4 +22,7 @@ entry:
ret void
}
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 4a12ed545b81..9b3dfab2be6a 100644
--- a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -1,5 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s
+; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s
+ ;
+; There is something about Tonga that causes this test to spend a lot of time
+; in the default register allocator.
+
; When the offset of VGPR spills into scratch space gets too large, an additional SGPR
; is used to calculate the scratch load/store address. Make sure that this
@@ -7,10 +11,10 @@
; Just test that it compiles successfully.
; CHECK-LABEL: test
-define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in,
- <96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) {
+define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
entry:
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
%a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr
@@ -24,10 +28,13 @@ entry:
call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" ()
call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" ()
- %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
+ %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %out, i32 %tid
store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr
ret void
}
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 9e181bc14d9d..d4e2dc814050 100644
--- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
; This is broken because the low half of the 64-bit add remains on the
; SALU, but the upper half does not. The addc expects the carry bit
@@ -62,7 +62,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6
; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%load = load i32, i32 addrspace(1)* %gep
%vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll
new file mode 100644
index 000000000000..237a62c1360a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
+; Make sure that when we split an smrd instruction in order to move it to
+; the VALU, we also move its users to the VALU.
+; CHECK-LABEL: {{^}}split_smrd_add_worklist:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+
+define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
+ %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+ %tmp1 = bitcast float %tmp to i32
+ br i1 undef, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ %tmp4 = bitcast float %tmp to i32
+ %tmp5 = add i32 %tmp4, 4
+ %tmp6 = sext i32 %tmp5 to i64
+ %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
+ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare i32 @llvm.SI.packf16(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
+!2 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index 4c82ed6affc2..484150bc25fd 100644
--- a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s
+; XFAIL: *
@sPrivateStorage = external addrspace(3) global [256 x [8 x <4 x i64>]]
@@ -34,14 +35,14 @@ define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly
entry:
%tmp = tail call i32 @llvm.r600.read.local.size.y()
%tmp1 = tail call i32 @llvm.r600.read.local.size.z()
- %tmp2 = tail call i32 @llvm.r600.read.tidig.x()
- %tmp3 = tail call i32 @llvm.r600.read.tidig.y()
- %tmp4 = tail call i32 @llvm.r600.read.tidig.z()
+ %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp3 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp4 = tail call i32 @llvm.amdgcn.workitem.id.z()
%tmp6 = mul i32 %tmp2, %tmp
%tmp10 = add i32 %tmp3, %tmp6
%tmp11 = mul i32 %tmp10, %tmp1
%tmp9 = add i32 %tmp11, %tmp4
- %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %x.i.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%x.i.12.i = tail call i32 @llvm.r600.read.local.size.x() #1
%mul.26.i = mul i32 %x.i.12.i, %x.i.i
%add.i = add i32 %tmp2, %mul.26.i
@@ -80,13 +81,13 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.y() #1
@@ -95,10 +96,10 @@ declare i32 @llvm.r600.read.local.size.y() #1
declare i32 @llvm.r600.read.local.size.z() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.z() #1
+declare i32 @llvm.amdgcn.workitem.id.z() #1
attributes #0 = { norecurse nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 3b59bbfb18c0..dddfbfd3ed10 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,213 +1,258 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-;EG-LABEL: {{^}}ashr_v2i32:
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+declare i32 @llvm.r600.read.tidig.x() #0
-;SI-LABEL: {{^}}ashr_v2i32:
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}ashr_v2i32:
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI-LABEL: {{^}}ashr_v2i32:
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
- %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
- %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
%result = ashr <2 x i32> %a, %b
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v4i32:
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}ashr_v4i32:
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-LABEL: {{^}}ashr_v4i32:
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v4i32:
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
- %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
- %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
%result = ashr <4 x i32> %a, %b
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_i64:
-;EG: ASHR
-
-;SI-LABEL: {{^}}ashr_i64:
-;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
-
-;VI-LABEL: {{^}}ashr_i64:
-;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+; FUNC-LABEL: {{^}}s_ashr_i64:
+; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
-define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+; EG: ASHR
+define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
- %0 = sext i32 %in to i64
- %1 = ashr i64 %0, 8
- store i64 %1, i64 addrspace(1)* %out
+ %in.ext = sext i32 %in to i64
+ %ashr = ashr i64 %in.ext, 8
+ store i64 %ashr, i64 addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_i64_2:
-;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
-;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
-;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
-;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-
-;SI-LABEL: {{^}}ashr_i64_2:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_i64_2:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}ashr_i64_2:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
+; EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+; EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
- %a = load i64, i64 addrspace(1) * %in
- %b = load i64, i64 addrspace(1) * %b_ptr
+ %a = load i64, i64 addrspace(1)* %in
+ %b = load i64, i64 addrspace(1)* %b_ptr
%result = ashr i64 %a, %b
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v2i64:
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: ASHR {{.*}}, [[SHA]]
-;EG-DAG: ASHR {{.*}}, [[SHB]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-
-;SI-LABEL: {{^}}ashr_v2i64:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v2i64:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}ashr_v2i64:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: ASHR {{.*}}, [[SHA]]
+; EG-DAG: ASHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
- %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
- %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
%result = ashr <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v4i64:
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: ASHR {{.*}}, [[SHA]]
-;EG-DAG: ASHR {{.*}}, [[SHB]]
-;EG-DAG: ASHR {{.*}}, [[SHC]]
-;EG-DAG: ASHR {{.*}}, [[SHD]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-
-;SI-LABEL: {{^}}ashr_v4i64:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v4i64:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FIXME: Broken on r600
+; XFUNC-LABEL: {{^}}s_ashr_v2i64:
+; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
+; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
+; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
+; %result = ashr <2 x i64> %a, %b
+; store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}ashr_v4i64:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: ASHR {{.*}}, [[SHA]]
+; EG-DAG: ASHR {{.*}}, [[SHB]]
+; EG-DAG: ASHR {{.*}}, [[SHC]]
+; EG-DAG: ASHR {{.*}}, [[SHD]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
- %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
- %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
+ %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
%result = ashr <4 x i64> %a, %b
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}s_ashr_32_i64:
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
+; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
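+; ashr by 32 needs no 64-bit shift: the shifted value is simply (hi, hi >> 31), so only the high dword and a 32-bit ashr feed the add above.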
+define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = ashr i64 %a, 32
+ %add = add i64 %result, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_32_i64:
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
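+; Only the high dword of the source is loaded; the stored pair is (hi, hi >> 31).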
+define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = ashr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_ashr_63_i64:
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
+; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
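+; ashr by 63 broadcasts the sign bit, so both words of the shifted value are hi >> 31.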
+define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = ashr i64 %a, 63
+ %add = add i64 %result, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_63_i64:
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
+define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = ashr i64 %a, 63
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll
index ba4049f28a6e..57a93ccd2505 100644
--- a/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/test/CodeGen/AMDGPU/store-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
; This test is for a bug in the machine scheduler where stores without
; an underlying object would be moved across the barrier. In this
@@ -12,16 +12,16 @@
; CHECK: s_barrier
; CHECK: s_endpgm
; Function Attrs: nounwind
-define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) {
+define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
bb:
%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
%tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
%tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
- %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2
+ %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 1
%tmp16 = add i32 %tmp13, 1
%tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
- store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
- tail call void @llvm.AMDGPU.barrier.local() #2
+ store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1
+ tail call void @llvm.amdgcn.s.barrier()
%tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
%tmp26 = sext i32 %tmp25 to i64
%tmp27 = sext i32 %arg4 to i64
@@ -37,6 +37,7 @@ bb:
}
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #1
-attributes #2 = { convergent nounwind }
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index e0c554ad2c17..b4d7505e0a8a 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -1,29 +1,128 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; SI-LABEL: {{^}}global_store_v3i64:
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}global_store_v3i64:
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}global_store_v3i64_unaligned:
+; GCN-LABEL: {{^}}global_store_v3i64_unaligned:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
ret void
}
-; SI-LABEL: {{^}}local_store_v3i64:
+; GCN-LABEL: {{^}}local_store_v3i64:
+; GCN: ds_write_b64
+; GCN: ds_write_b64
+; GCN: ds_write_b64
define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}local_store_v3i64_unaligned:
-define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
- store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
+; GCN-LABEL: {{^}}local_store_v3i64_unaligned:
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
+; GCN-DAG: buffer_store_dwordx2
+; GCN-DAG: buffer_store_dword v
+define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i32>
+ store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_dword v
+define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i16>
+ store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_byte v
+define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i8>
+ store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i1:
+; GCN-DAG: buffer_store_byte v
+; GCN-DAG: buffer_store_byte v
+; GCN-DAG: buffer_store_byte v
+define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i1>
+ store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll
index d22f43fa05ef..68c659fdd2ff 100644
--- a/test/CodeGen/AMDGPU/store.ll
+++ b/test/CodeGen/AMDGPU/store.ll
@@ -77,12 +77,31 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}store_i24:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_short
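+; The i24 store is split into a short store of the low 16 bits and a byte store of bits 16-23, hence the shift right by 16.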
+define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+entry:
+ store i24 %in, i24 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i25:
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
+; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
+; SI: buffer_store_dword [[VAND]]
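+; The i25 store is widened to a single dword store of the value masked to its low 25 bits (0x1ffffff).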
+define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+entry:
+ store i25 %in, i25 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}store_v2i8:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_short
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
@@ -96,8 +115,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI: buffer_store_short
-; SI: buffer_store_short
+; SI: buffer_store_dword
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
@@ -110,10 +128,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dword
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
@@ -135,17 +150,9 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
}
; FUNC-LABEL: {{^}}store_v4i16:
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG-NOT: MEM_RAT MSKOR
+; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI-NOT: buffer_store_byte
+; SI: buffer_store_dwordx2
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
@@ -239,8 +246,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
; CM: LDS_WRITE
-; SI: ds_write_b16
-; SI: ds_write_b16
+; SI: ds_write_b32
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(3)* %out
@@ -252,10 +258,7 @@ entry:
; CM: LDS_WRITE
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI: ds_write_b32
define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(3)* %out
@@ -287,8 +290,7 @@ entry:
; CM: LDS_WRITE
; CM: LDS_WRITE
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(3)* %out
@@ -358,20 +360,13 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
; When i128 was a legal type this program generated cannot select errors:
; FUNC-LABEL: {{^}}"i128-const-store":
-; FIXME: We should be able to do this with one store instruction
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 1
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
+
; SI: buffer_store_dwordx4
define void @i128-const-store(i32 addrspace(1)* %out) {
entry:
@@ -384,3 +379,5 @@ entry:
store i32 2, i32 addrspace(1)* %arrayidx6, align 4
ret void
}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll
index 02e592e9a559..174e64e2cf8b 100644
--- a/test/CodeGen/AMDGPU/structurize.ll
+++ b/test/CodeGen/AMDGPU/structurize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -r600-ir-structurize=0 | FileCheck %s
; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
;
; entry
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
index 77432c1f9d2b..db0f50247e38 100644
--- a/test/CodeGen/AMDGPU/structurize1.ll
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -r600-if-convert=0 < %s | FileCheck %s
; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
; like this:
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 9f9446a4e608..5a026cdf2990 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -58,13 +58,11 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
; SI: s_sub_u32
; SI: s_subb_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
%result = sub i64 %a, %b
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -75,13 +73,11 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
@@ -110,13 +106,13 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
}
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
index c4dae4736cfa..ec2ed78b4954 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -44,7 +44,7 @@ for.inc.1: ; preds = %do.body.1562.prehea
; SI-LABEL: {{^}}foo:
; SI: s_endpgm
-define void @foo() #0 {
+define amdgpu_ps void @foo() #0 {
bb:
br i1 undef, label %bb2, label %bb1
@@ -67,7 +67,7 @@ bb7: ; preds = %bb6
br label %bb4
bb9: ; preds = %bb2
- %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2)
+ %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp11 = extractelement <4 x float> %tmp10, i32 1
%tmp12 = extractelement <4 x float> %tmp10, i32 3
br label %bb14
@@ -98,12 +98,12 @@ bb27: ; preds = %bb24
}
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index ac9bedb2f8b5..4b6f65a77b9a 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -7,23 +7,25 @@ target triple="amdgcn--"
; CHECK: s_load_dword s2, s[0:1], 0x9
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; CHECK: v_mbcnt_lo_u32_b32_e64
+; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; BB0_1:
-; CHECK: s_load_dword s6, s[0:1], 0xa
+; CHECK: s_load_dword s0, s[0:1], 0xa
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
; BB0_2:
; CHECK: s_or_b64 exec, exec, s[2:3]
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, -1
-; CHECK-NEXT: buffer_store_dword v1, s[4:7], 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
; CHECK-NEXT: s_endpgm
define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
entry:
%v0 = insertelement <4 x float> undef, float %a0, i32 0
- br i1 undef, label %ift, label %ife
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
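+  ; mbcnt gives each lane a different value, so the compare below yields a genuinely divergent branch condition instead of an undef one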
+ %cnd = icmp eq i32 %tid, 0
+ br i1 %cnd, label %ift, label %ife
ift:
%v1 = insertelement <4 x float> undef, float %a1, i32 0
@@ -35,3 +37,7 @@ ife:
store float %v2, float addrspace(1)* %out, align 4
ret void
}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/swizzle-export.ll b/test/CodeGen/AMDGPU/swizzle-export.ll
index 000ee2faa478..7cf380520d42 100644
--- a/test/CodeGen/AMDGPU/swizzle-export.ll
+++ b/test/CodeGen/AMDGPU/swizzle-export.ll
@@ -6,7 +6,7 @@
;EG: EXPORT T{{[0-9]+}}.XXWX
;EG: EXPORT T{{[0-9]+}}.XXXW
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -68,27 +68,27 @@ main_body:
%57 = insertelement <4 x float> %56, float %1, i32 1
%58 = insertelement <4 x float> %57, float %2, i32 2
%59 = insertelement <4 x float> %58, float %3, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %59, i32 60, i32 1)
%60 = insertelement <4 x float> undef, float %10, i32 0
%61 = insertelement <4 x float> %60, float %13, i32 1
%62 = insertelement <4 x float> %61, float %16, i32 2
%63 = insertelement <4 x float> %62, float %19, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %63, i32 0, i32 2)
%64 = insertelement <4 x float> undef, float %22, i32 0
%65 = insertelement <4 x float> %64, float %25, i32 1
%66 = insertelement <4 x float> %65, float %28, i32 2
%67 = insertelement <4 x float> %66, float %31, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %67, i32 1, i32 2)
%68 = insertelement <4 x float> undef, float %34, i32 0
%69 = insertelement <4 x float> %68, float %37, i32 1
%70 = insertelement <4 x float> %69, float %40, i32 2
%71 = insertelement <4 x float> %70, float %43, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %71, i32 2, i32 2)
%72 = insertelement <4 x float> undef, float %46, i32 0
%73 = insertelement <4 x float> %72, float %49, i32 1
%74 = insertelement <4 x float> %73, float %52, i32 2
%75 = insertelement <4 x float> %74, float %55, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %75, i32 3, i32 2)
ret void
}
@@ -96,7 +96,7 @@ main_body:
; EG: T{{[0-9]+}}.XY__
; EG: T{{[0-9]+}}.ZXY0
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -111,19 +111,18 @@ main_body:
%10 = extractelement <4 x float> %9, i32 1
%11 = insertelement <4 x float> undef, float %2, i32 0
%12 = insertelement <4 x float> %11, float %3, i32 1
- call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %12, i32 60, i32 1)
%13 = insertelement <4 x float> undef, float %6, i32 0
%14 = insertelement <4 x float> %13, float %8, i32 1
%15 = insertelement <4 x float> %14, float %10, i32 2
%16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %16, i32 0, i32 2)
ret void
}
; Function Attrs: nounwind readonly
declare float @llvm.cos.f32(float) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll
new file mode 100644
index 000000000000..c1662acbf2a0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/target-cpu.ll
@@ -0,0 +1,112 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+
+; VI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.wb() #0
+
+; CHECK-LABEL: {{^}}target_none:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_none() #0 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_tahiti:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_tahiti() #2 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_bonaire:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_dcache_inv_vol
+define void @target_bonaire() #3 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.inv.vol()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_fiji:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
+; CHECK: flat_store_dword
+; CHECK: s_dcache_wb{{$}}
+define void @target_fiji() #4 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.wb()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}promote_alloca_enabled:
+; CHECK: ds_read_b32
+; CHECK: ; LDSByteSize: 5120
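+; Assuming the default 256 work-items per workgroup: 5 x i32 = 20 bytes per item, 20 x 256 = 5120 bytes of LDS.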
+define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}promote_alloca_disabled:
+; CHECK: SCRATCH_RSRC_DWORD0
+; CHECK: SCRATCH_RSRC_DWORD1
+; CHECK: ScratchSize: 24
+define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+attributes #3 = { nounwind "target-cpu"="bonaire" }
+attributes #4 = { nounwind "target-cpu"="fiji" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }
diff --git a/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
index cbb9c50974a4..2420286f766e 100644
--- a/test/CodeGen/AMDGPU/tex-clause-antidep.ll
+++ b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
@@ -3,7 +3,7 @@
;CHECK: TEX
;CHECK-NEXT: ALU
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_vs void @test(<4 x float> inreg %reg0) {
%1 = extractelement <4 x float> %reg0, i32 0
%2 = extractelement <4 x float> %reg0, i32 1
%3 = extractelement <4 x float> %reg0, i32 2
@@ -12,14 +12,12 @@ define void @test(<4 x float> inreg %reg0) #0 {
%6 = insertelement <4 x float> %5, float %2, i32 1
%7 = insertelement <4 x float> %6, float %3, i32 2
%8 = insertelement <4 x float> %7, float %4, i32 3
- %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %9 = call <4 x float> @llvm.r600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %10 = call <4 x float> @llvm.r600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%11 = fadd <4 x float> %9, %10
- call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %11, i32 0, i32 0)
ret void
}
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" } \ No newline at end of file
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/texture-input-merge.ll b/test/CodeGen/AMDGPU/texture-input-merge.ll
index 789538af5821..a56a5ca39dcc 100644
--- a/test/CodeGen/AMDGPU/texture-input-merge.ll
+++ b/test/CodeGen/AMDGPU/texture-input-merge.ll
@@ -2,7 +2,7 @@
;CHECK-NOT: MOV
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_vs void @test(<4 x float> inreg %reg0) {
%1 = extractelement <4 x float> %reg0, i32 0
%2 = extractelement <4 x float> %reg0, i32 1
%3 = extractelement <4 x float> %reg0, i32 2
@@ -16,16 +16,14 @@ define void @test(<4 x float> inreg %reg0) #0 {
%11 = insertelement <4 x float> undef, float %7, i32 0
%12 = insertelement <4 x float> %11, float %5, i32 1
%13 = insertelement <4 x float> undef, float %8, i32 0
- %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %14 = call <4 x float> @llvm.r600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %15 = call <4 x float> @llvm.r600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %16 = call <4 x float> @llvm.r600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%17 = fadd <4 x float> %14, %15
%18 = fadd <4 x float> %17, %16
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 0)
ret void
}
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" } \ No newline at end of file
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll
new file mode 100644
index 000000000000..1555cfe39b1e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trap.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+; GCN: warning: <unknown>:0:0: in function trap void (): trap handler not supported
+
+declare void @llvm.trap() #0
+
+; GCN-LABEL: {{^}}trap:
+; GCN: s_endpgm
+; GCN-NEXT: s_endpgm
+define void @trap() {
+ call void @llvm.trap()
+ ret void
+}
+
+attributes #0 = { nounwind noreturn }
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
new file mode 100644
index 000000000000..9e2373c55e35
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %bc = bitcast <2 x i32> %ld to i64
+ %trunc = trunc i64 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+ %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ %bc = bitcast <3 x i32> %ld to i96
+ %trunc = trunc i96 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %bc = bitcast <4 x i32> %ld to i128
+ %trunc = trunc i128 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+ %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %bc = bitcast <2 x i16> %ld to i32
+ %trunc = trunc i32 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+ %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %bc = bitcast <4 x i16> %ld to i64
+ %trunc = trunc i64 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+ %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %bc = bitcast <2 x i8> %ld to i16
+ %trunc = trunc i16 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+ %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %bc = bitcast <4 x i8> %ld to i32
+ %trunc = trunc i32 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %bc = bitcast <3 x i8> %ld to i24
+ %trunc = trunc i24 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index dac74728b3ce..6d820dbd2692 100644
--- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -4,8 +4,7 @@
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}}
+; SI: v_cmp_eq_i32_e32 vcc, 0, [[TMP]]{{$}}
; SI: v_cndmask_b32_e64
; SI: buffer_store_byte
define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
@@ -23,7 +22,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -46,7 +45,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -58,7 +57,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@@ -82,7 +81,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@@ -94,7 +93,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -120,7 +119,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -159,7 +158,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
; SI-NEXT: v_cndmask_b32_e64
-; SI-NEXT: buffer_store_byte
+; SI: buffer_store_byte
define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
%load = load i8, i8 addrspace(1)* %in
%masked = and i8 %load, 255
diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll
index 4ba815f26690..cf5c00e65b7d 100644
--- a/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/test/CodeGen/AMDGPU/trunc-store.ll
@@ -2,22 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
%trunc = trunc <16 x i32> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
@@ -25,22 +10,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x
}
; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
%trunc = trunc <16 x i64> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index ad52d0f2e238..dbd07fee6bbe 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -37,8 +37,8 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
; SI: s_addc_u32
; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32
; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: v_mov_b32_e32
; SI: buffer_store_dword v[[LO_VREG]],
define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
%aa = add i64 %a, 234 ; Prevent shrinking store.
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index 2a09e0b20498..f72c22095e4a 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
@@ -91,3 +91,57 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}v_udiv_i8:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
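+; Sub-24-bit unsigned division is expanded through the f32 reciprocal; the quotient is masked back to the source width before the store.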
+define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = udiv i8 %num, %den
+ %result.ext = zext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i16:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in
+ %den = load i16, i16 addrspace(1) * %den_ptr
+ %result = udiv i16 %num, %den
+ %result.ext = zext i16 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i23:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = udiv i23 %num, %den
+ %result.ext = zext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i24:
+; SI-NOT: v_rcp_f32
+define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = udiv i24 %num, %den
+ %result.ext = zext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
index f692b7dfdc27..268f3c764d6e 100644
--- a/test/CodeGen/AMDGPU/udivrem.ll
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
@@ -51,11 +51,11 @@
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
%result0 = udiv i32 %x, %y
- store i32 %result0, i32 addrspace(1)* %out
+ store i32 %result0, i32 addrspace(1)* %out0
%result1 = urem i32 %x, %y
- store i32 %result1, i32 addrspace(1)* %out
+ store i32 %result1, i32 addrspace(1)* %out1
ret void
}
@@ -107,50 +107,54 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]]
+; For SI, we used to have checks for the input and output registers
+; of the instructions, but these are way too fragile. The division for
+; the two vector elements can be intermixed, which makes it impossible to
+; accurately check all the operands.
+; SI-DAG: v_rcp_iflag_f32_e32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_sub_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_and_b32_e32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_rcp_iflag_f32_e32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_sub_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_and_b32_e32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
index 4de881b66f10..147b95560935 100644
--- a/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv24_i8:
@@ -40,7 +40,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}udiv24_i32:
+; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_f32
@@ -50,6 +50,23 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
+define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i23.0 = shl i32 %den, 9
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i23, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
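+; Operands a full 24 bits wide do not take the f32 fast path; only narrower operands do (see udiv23_i32 above).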
define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -63,6 +80,40 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
+; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i24.0 = shl i32 %den, 8
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = udiv i32 %num.i23, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i23.0 = shl i32 %den, 9
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i24, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
@@ -74,11 +125,11 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
- %den.i24.0 = shl i32 %den, 7
- %num.i24 = lshr i32 %num.i24.0, 7
- %den.i24 = lshr i32 %den.i24.0, 7
- %result = udiv i32 %num.i24, %den.i24
+ %num.i25.0 = shl i32 %num, 7
+ %den.i25.0 = shl i32 %den, 7
+ %num.i25 = lshr i32 %num.i25.0, 7
+ %den.i25 = lshr i32 %den.i25.0, 7
+ %result = udiv i32 %num.i25, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
@@ -162,15 +213,8 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
}
; FUNC-LABEL: {{^}}urem24_i32:
-; SI: v_cvt_f32_u32
-; SI: v_cvt_f32_u32
-; SI: v_rcp_f32
-; SI: v_cvt_u32_f32
-
-; EG: UINT_TO_FLT
-; EG-DAG: UINT_TO_FLT
-; EG-DAG: RECIP_IEEE
-; EG: FLT_TO_UINT
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -243,3 +287,41 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i16.0 = shl i32 %num, 16
+ %den.i23.0 = shl i32 %den, 9
+ %num.i16 = lshr i32 %num.i16.0, 16
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i16, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i16.0 = shl i32 %den, 16
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i16 = lshr i32 %den.i16.0, 16
+ %result = udiv i32 %num.i23, %den.i16
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
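
The shl/lshr pairs in the tests above are simply how the IR proves to the backend that a value has at most 23 or 24 significant bits; the checks then distinguish the cheap float-reciprocal lowering (v_rcp_f32 / RECIP_IEEE) from the full 32-bit algorithm (v_rcp_iflag). The C sketch below is illustrative only and not taken from the patch (the exact width cutoff and fix-up sequence belong to the backend, and the checks above encode that cutoff): a sufficiently narrow unsigned divide can be done as a single-precision divide plus an integer correction.

#include <stdint.h>

/* Illustrative sketch: exact floor(n/d) when n and d are narrow enough to be
 * represented exactly as floats (for example below 2^24). The real lowering's
 * threshold and correction steps may differ. Assumes d != 0, as udiv does. */
uint32_t udiv_narrow(uint32_t n, uint32_t d) {
  float approx = (float)n / (float)d;   /* approximate quotient */
  uint32_t q = (uint32_t)approx;
  while ((uint64_t)(q + 1) * d <= n)    /* approximation rounded down too far */
    q++;
  while ((uint64_t)q * d > n)           /* approximation rounded up too far */
    q--;
  return q;
}

Once an operand may need more bits than a float significand holds exactly, the shortcut is no longer exact, which matches the udiv25_i32 and no_udiv24 cases above expecting v_rcp_iflag rather than v_rcp_f32.
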
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
index 9f3069bdf80c..72e6af9a6eea 100644
--- a/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -1,4 +1,4 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
@@ -184,7 +184,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
ret void
}
-;FUNC-LABEL: {{^}}test_udiv2464:
+;FUNC-LABEL: {{^}}test_udiv2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -195,15 +195,15 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = udiv i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_urem2464:
+;FUNC-LABEL: {{^}}test_urem2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -214,9 +214,9 @@ define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = urem i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 65fe580792a5..b36ce6b8d6ca 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64
; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
%result = uitofp i64 %val to double
@@ -70,14 +70,14 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
ret void
}
-; FIXME: select on 0, 0
-; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; SI: v_cmp_eq_i32_e64 vcc
; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 3ab11442d5cc..27c41e41a0e7 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -22,7 +22,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN-DAG: v_cmp_lt_u64
; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
-; GCN: {{buffer|flat}}_store_dword [[VR]]
+; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index a3343d1e2d9c..0c3d54cf0d09 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -103,7 +103,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: v_cmp_eq_i32
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
-; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
new file mode 100644
index 000000000000..a26eb8f9ada9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -0,0 +1,484 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
+; GCN: v_max_u32
+; GCN: v_min_u32
+define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store volatile i32 %i0, i32 addrspace(1)* %outgep
+ store volatile i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 17
+ %i0 = select i1 %icmp0, i32 %a, i32 17
+
+ %icmp1 = icmp ult i32 %i0, 12
+ %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
+; GCN: v_cmp_lt_u64
+; GCN: v_cmp_gt_u64
+define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i64 %a, 12
+ %i0 = select i1 %icmp0, i64 %a, i64 12
+
+ %icmp1 = icmp ult i64 %i0, 17
+ %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+ store i64 %i1, i64 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i16 %a, 12
+ %i0 = select i1 %icmp0, i16 %a, i16 12
+
+ %icmp1 = icmp ult i16 %i0, 17
+ %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+define internal i32 @umin(i32 %x, i32 %y) #2 {
+ %cmp = icmp ult i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i32 @umax(i32 %x, i32 %y) #2 {
+ %cmp = icmp ugt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i16 @umin16(i16 %x, i16 %y) #2 {
+ %cmp = icmp ult i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i16 @umax16(i16 %x, i16 %y) #2 {
+ %cmp = icmp ugt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i8 @umin8(i8 %x, i8 %y) #2 {
+ %cmp = icmp ult i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+define internal i8 @umax8(i8 %x, i8 %y) #2 {
+ %cmp = icmp ugt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+; 16 combinations
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; + commute outermost max
+
+
+; FIXME: In these cases we probably should have used scalar operations
+; instead.
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i16_pat_0:
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+bb:
+ %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i8_pat_0:
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+bb:
+ %tmp0 = call i8 @umin8(i8 %x, i8 %y)
+ %tmp1 = call i8 @umax8(i8 %x, i8 %y)
+ %tmp2 = call i8 @umin8(i8 %tmp1, i8 %z)
+ %tmp3 = call i8 @umax8(i8 %tmp0, i8 %tmp2)
+ store i8 %tmp3, i8 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp0, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp1, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp2, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 1, i32 %y)
+ %tmp1 = call i32 @umax(i32 1, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 2)
+ %tmp1 = call i32 @umax(i32 %x, i32 2)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
+define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 9)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone alwaysinline }
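
The eight min/max pattern comments near the top of the new umed3.ll (doubled to 16 by commuting the outermost max) all describe one computation. The C sketch below is illustrative only and not part of the patch: max(min(x, y), min(max(x, y), z)) is the median of the three unsigned values, which is exactly what a single v_med3_u32 produces.

#include <stdint.h>

static uint32_t umin32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t umax32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Pattern 0 from the list in the test; the other 15 forms compute the same
 * value, the middle element of {x, y, z}. */
uint32_t umed3(uint32_t x, uint32_t y, uint32_t z) {
  return umax32(umin32(x, y), umin32(umax32(x, y), z));
}

The multi_use variants also store an intermediate min or max, so folding into v_med3_u32 would not make that intermediate dead; that is presumably why those tests expect no v_med3_u32.
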
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 82d88ebd3ae7..129748afd938 100644
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,62 +1,97 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
-; SI-LABEL: {{^}}unaligned_load_store_i16_local:
+; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
%v = load i16, i16 addrspace(3)* %p, align 1
store i16 %v, i16 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i16_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI-LABEL: {{^}}global_unaligned_load_store_i16:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_ushort
+; UNALIGNED: buffer_store_short
; SI: s_endpgm
-define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
+define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
%v = load i16, i16 addrspace(1)* %p, align 1
store i16 %v, i16 addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i32_local:
+; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:
+
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
+; SI-NOT: v_or
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i32_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+; SI-LABEL: {{^}}global_unaligned_load_store_i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dword
+; UNALIGNED: buffer_store_dword
+define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(1)* %p, align 1
store i32 %v, i32 addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i64_local:
+; SI-LABEL: {{^}}global_align2_load_store_i32:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+
+; UNALIGNED: buffer_load_dword
+; UNALIGNED: buffer_store_dword
+define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(1)* %p, align 2
+ store i32 %v, i32 addrspace(1)* %r, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_align2_load_store_i32:
+; GCN: ds_read_u16
+; GCN: ds_read_u16
+; GCN: ds_write_b16
+; GCN: ds_write_b16
+define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+ %v = load i32, i32 addrspace(3)* %p, align 2
+ store i32 %v, i32 addrspace(3)* %r, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
@@ -65,139 +100,263 @@ define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
+define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
%v = load i64, i64 addrspace(3)* %p, align 1
store i64 %v, i64 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i64_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
- %v = load i64, i64 addrspace(1)* %p, align 1
- store i64 %v, i64 addrspace(1)* %r, align 1
- ret void
-}
-
-; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
+; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
-
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+ %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
+ store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}global_align2_load_store_i64:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+
+; UNALIGNED: buffer_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(1)* %p, align 2
+ store i64 %v, i64 addrspace(1)* %r, align 2
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(1)* %p, align 1
+ store i64 %v, i64 addrspace(1)* %r, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_unaligned_load_store_v4i32:
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: s_endpgm
+define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
ret void
}
-; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+; SI-LABEL: {{^}}global_unaligned_load_store_v4i32
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dwordx4
+; UNALIGNED: buffer_store_dwordx4
+define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4:
-; SI: ds_read2_b32
-; SI: s_endpgm
-define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_4:
+; GCN: ds_read2_b32
+define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-; SI: s_endpgm
-define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
+; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
+define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
%val = load i64, i64 addrspace(3)* %ptr, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; FUNC-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
-; SI: s_endpgm
-define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
+; GCN: s_endpgm
+define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -206,49 +365,191 @@ define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_1:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-
-define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_1:
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: store_dwordx2
+define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 1
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4:
-; SI: ds_write2_b32
-; SI: s_endpgm
-define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+; FUNC-LABEL: {{^}}local_store_i64_align_4:
+; GCN: ds_write2_b32
+define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
store i64 %val, i64 addrspace(3)* %out, align 4
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
-; SI: s_endpgm
-define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; GCN: s_endpgm
+define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
store i64 0, i64 addrspace(3)* %ptr, align 4
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; FUNC-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI: s_endpgm
-define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN: s_endpgm
+define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
%ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
store i64 0, i64 addrspace(3)* %out, align 4
ret void
}
+
+; SI-LABEL: {{^}}constant_unaligned_load_i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: s_load_dword
+
+; SI: buffer_store_dword
+define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(2)* %p, align 1
+ store i32 %v, i32 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i32:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; UNALIGNED: s_load_dword
+; UNALIGNED: buffer_store_dword
+define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(2)* %p, align 2
+ store i32 %v, i32 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i64:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; UNALIGNED: s_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(2)* %p, align 2
+ store i64 %v, i64 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_i64:
+; SI: s_load_dwordx2
+; SI: buffer_store_dwordx2
+define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(2)* %p, align 4
+ store i64 %v, i64 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_v4i32:
+; SI: s_load_dwordx4
+; SI: buffer_store_dwordx4
+define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+ %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: buffer_load_dwordx2
+
+; SI: buffer_store_dwordx2
+define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+ %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
+ store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: buffer_load_dwordx4
+
+; SI: buffer_store_dwordx4
+define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+ %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_i8:
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+ %v = load i8, i8 addrspace(2)* %p, align 4
+ store i8 %v, i8 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i8:
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+ %v = load i8, i8 addrspace(2)* %p, align 2
+ store i8 %v, i8 addrspace(1)* %r, align 2
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
+ %v0 = load i32, i32 addrspace(2)* %p, align 4
+ %v1 = load i32, i32 addrspace(2)* %gep0, align 4
+
+ %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
+ store i32 %v0, i32 addrspace(1)* %r, align 4
+ store i32 %v1, i32 addrspace(1)* %gep1, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
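
The ALIGNED and UNALIGNED prefixes above check two lowerings of the same IR: without the unaligned-buffer-access feature an under-aligned global access is split into byte or short operations, while with it a single wide load or store is emitted. As a purely illustrative C sketch, assumed rather than derived from the backend, the byte expansion of an align-1 i32 load amounts to reassembling a little-endian value from four byte loads:

#include <stdint.h>

/* Illustrative: assemble a 32-bit little-endian value from four byte loads,
 * roughly what four buffer_load_ubyte results feed into when the loaded
 * value has to end up whole in a single register. */
uint32_t load_u32_align1(const uint8_t *p) {
  return  (uint32_t)p[0]
        | ((uint32_t)p[1] << 8)
        | ((uint32_t)p[2] << 16)
        | ((uint32_t)p[3] << 24);
}

When the value is immediately stored back byte by byte, as in the local (ds_) tests, this reassembly is unnecessary, which is what the SI-NOT: v_or / v_lshl checks verify.
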
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
new file mode 100644
index 000000000000..4902e9a3cafb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; We may have subregister live ranges that are undefined on some paths. The
+; verifier should not complain about this.
+
+
+; CHECK-LABEL: {{^}}func:
+define void @func() #0 {
+B0:
+ br i1 undef, label %B1, label %B2
+
+B1:
+ br label %B2
+
+B2:
+ %v0 = phi <4 x float> [ zeroinitializer, %B1 ], [ <float 0.0, float 0.0, float 0.0, float undef>, %B0 ]
+ br i1 undef, label %B30.1, label %B30.2
+
+B30.1:
+ %sub = fsub <4 x float> %v0, undef
+ br label %B30.2
+
+B30.2:
+ %v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v0, %B2 ]
+ %ve0 = extractelement <4 x float> %v3, i32 0
+ store float %ve0, float addrspace(3)* undef, align 4
+ ret void
+}
+
+; FIXME: Extra undef subregister copy should be removed before being
+; overwritten with the defined copy
+; CHECK-LABEL: {{^}}valley_partially_undef_copy:
+define amdgpu_ps float @valley_partially_undef_copy() #0 {
+bb:
+ %tmp = load volatile i32, i32 addrspace(1)* undef, align 4
+ %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4
+ %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp5 = extractelement <4 x float> %tmp4, i32 0
+ %tmp6 = fmul float %tmp5, undef
+ %tmp7 = fadd float %tmp6, %tmp6
+ %tmp8 = insertelement <4 x i32> %tmp2, i32 %tmp, i32 1
+ store <4 x i32> %tmp8, <4 x i32> addrspace(1)* undef, align 16
+ store float %tmp7, float addrspace(1)* undef, align 4
+ br label %bb9
+
+bb9: ; preds = %bb9, %bb
+ %tmp10 = icmp eq i32 %tmp, 0
+ br i1 %tmp10, label %bb9, label %bb11
+
+bb11: ; preds = %bb9
+ store <4 x i32> %tmp2, <4 x i32> addrspace(1)* undef, align 16
+ ret float undef
+}
+
+; FIXME: Should be able to remove the undef copies
+
+; CHECK-LABEL: {{^}}partially_undef_copy:
+; CHECK: v_mov_b32_e32 v5, 5
+; CHECK: v_mov_b32_e32 v6, 6
+
+; CHECK: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
+
+; Undef copy
+; CHECK: v_mov_b32_e32 v1, v6
+
+; undef copy
+; CHECK: v_mov_b32_e32 v2, v7
+
+; CHECK: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
+; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
+define void @partially_undef_copy() #0 {
+ %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
+ %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
+
+ %partially.undef.0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+ %partially.undef.1 = insertelement <4 x i32> %partially.undef.0, i32 %tmp1, i32 0
+
+ store volatile <4 x i32> %partially.undef.1, <4 x i32> addrspace(1)* undef, align 16
+ tail call void asm sideeffect "v_nop", "v={VGPR5_VGPR6_VGPR7_VGPR8}"(<4 x i32> %partially.undef.0)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index 036a7e91b47f..f09f73c38b4c 100644
--- a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -1,8 +1,6 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+; XUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
; SI hits an assertion at -O0; evergreen hits a 'not implemented' unreachable.
@@ -41,8 +39,9 @@ for.end: ; preds = %for.body, %entry
}
; COMMON-LABEL: {{^}}branch_false:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 false, label %for.end, label %for.body.lr.ph
@@ -77,8 +76,9 @@ for.end: ; preds = %for.body, %entry
}
; COMMON-LABEL: {{^}}branch_undef:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 undef, label %for.end, label %for.body.lr.ph
diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
new file mode 100644
index 000000000000..93a2c6998be4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; This used to raise an assertion due to how the choice between uniform and
+; non-uniform branches was determined.
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_cbranch_vccnz
+define amdgpu_ps float @main(<4 x i32> inreg %rsrc) {
+main_body:
+ %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 true, i1 false)
+ %cc = fcmp une float %v, 1.000000e+00
+ br i1 %cc, label %if, label %else
+
+if:
+ %u = fadd float %v, %v
+ br label %else
+
+else:
+ %r = phi float [ %v, %main_body ], [ %u, %if ]
+ ret float %r
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
new file mode 100644
index 000000000000..ac9e2b5f8432
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -0,0 +1,439 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; SI-LABEL: {{^}}uniform_if_scc:
+; SI-DAG: s_cmp_eq_i32 s{{[0-9]+}}, 0
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %else
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_vcc:
+; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
+; also scheduled the write first.
+; SI-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: s_and_b64 vcc, exec, [[COND]]
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = fcmp oeq float %cond, 0.0
+ br i1 %cmp0, label %if, label %else
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_swap_br_targets_scc:
+; SI-DAG: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
+; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
+; also scheduled the write first.
+; SI-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: s_and_b64 vcc, exec, [[COND]]
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = fcmp oeq float %cond, 0.0
+ br i1 %cmp0, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_move_valu:
+; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
+; Using a floating-point value in an integer compare will cause the compare to
+; be selected for the SALU and then later moved to the VALU.
+; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
+; SI: s_and_b64 vcc, exec, [[COND]]
+; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: buffer_store_dword
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+entry:
+ %a.0 = fadd float %a, 10.0
+ %cond = bitcast float %a.0 to i32
+ %cmp = icmp eq i32 %cond, 5
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_move_valu_commute:
+; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
+; Using a floating-point value in an integer compare will cause the compare to
+; be selected for the SALU and then later moved to the VALU.
+; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
+; SI: s_and_b64 vcc, exec, [[COND]]
+; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: buffer_store_dword
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+entry:
+ %a.0 = fadd float %a, 10.0
+ %cond = bitcast float %a.0 to i32
+ %cmp = icmp ugt i32 %cond, 5
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+
+; SI-LABEL: {{^}}uniform_if_else_ret:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_endpgm
+
+; SI: {{^}}[[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: s_endpgm
+define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32 addrspace(1)* %out
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 2, i32 addrspace(1)* %out
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_else:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+
+; SI: [[ENDIF_LABEL]]:
+; SI: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
+; SI: buffer_store_dword [[THREE]]
+; SI: s_endpgm
+define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32 addrspace(1)* %out0
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 2, i32 addrspace(1)* %out0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ store i32 3, i32 addrspace(1)* %out1
+ ret void
+}
+
+; SI-LABEL: {{^}}icmp_2_users:
+; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1
+; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]]
+; SI: buffer_store_dword
+; SI: [[LABEL]]:
+; SI: s_endpgm
+define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+main_body:
+ %0 = icmp sgt i32 %cond, 0
+ %1 = sext i1 %0 to i32
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ store i32 %1, i32 addrspace(1)* %out
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ ret void
+}
+
+; SI-LABEL: {{^}}icmp_users_different_blocks:
+; SI: s_load_dword [[COND:s[0-9]+]]
+; SI: s_cmp_lt_i32 [[COND]], 1
+; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
+; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]]
+; SI: s_and_b64 vcc, exec, [[MASK]]
+; SI: s_cbranch_vccnz [[EXIT]]
+; SI: buffer_store
+; SI: {{^}}[[EXIT]]:
+; SI: s_endpgm
+define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %cmp0 = icmp sgt i32 %cond0, 0
+ %cmp1 = icmp sgt i32 %cond1, 0
+ br i1 %cmp0, label %bb2, label %bb9
+
+bb2: ; preds = %bb
+ %tmp2 = sext i1 %cmp1 to i32
+ %tmp3 = add i32 %tmp2, %tmp
+ br i1 %cmp1, label %bb9, label %bb7
+
+bb7: ; preds = %bb5
+ store i32 %tmp3, i32 addrspace(1)* %out
+ br label %bb9
+
+bb9: ; preds = %bb8, %bb4
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_loop:
+; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
+; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
+; get s_add_i32 here.
+; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
+; SI: s_and_b64 vcc, exec, vcc
+; SI: s_cbranch_vccnz [[LOOP_LABEL]]
+; SI: s_endpgm
+define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [0, %entry], [%i.i, %loop]
+ %i.i = add i32 %i, 1
+ %cmp = icmp eq i32 %a, %i.i
+ br i1 %cmp, label %done, label %loop
+
+done:
+ ret void
+}
+
+; Test mixed uniform and divergent control flow.
+
+; SI-LABEL: {{^}}uniform_inside_divergent:
+; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_cmp_lg_i32 {{s[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[ENDIF_LABEL]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if_uniform, label %endif
+
+if_uniform:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}divergent_inside_uniform:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if_uniform, label %endif
+
+if_uniform:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}divergent_if_uniform_if:
+; SI: v_cmp_eq_i32_e32 vcc, 0, v0
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: s_or_b64 exec, exec, [[MASK]]
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: [[EXIT]]:
+; SI: s_endpgm
+define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp eq i32 %tid, 0
+ br i1 %d_cmp, label %if, label %endif
+
+if:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if_uniform, label %exit
+
+if_uniform:
+ store i32 2, i32 addrspace(1)* %out
+ br label %exit
+
+exit:
+ ret void
+}
+
+; The conditions of the branches in the two blocks are
+; uniform. MachineCSE replaces the second condition with the inverse of
+; the first, leaving an SCC use in a different block than the one
+; where it was defined.
+
+; SI-LABEL: {{^}}cse_uniform_condition_different_blocks:
+; SI: s_load_dword [[COND:s[0-9]+]]
+; SI: s_cmp_lt_i32 [[COND]], 1
+; SI: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
+
+; SI: BB#1:
+; SI-NOT: cmp
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: s_cbranch_scc1 BB[[FNNUM]]_3
+
+; SI: BB[[FNNUM]]_3:
+; SI: s_endpgm
+define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %tmp1 = icmp sgt i32 %cond, 0
+ br i1 %tmp1, label %bb2, label %bb9
+
+bb2: ; preds = %bb
+ %tmp3 = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 0, i32 addrspace(1)* undef
+ %tmp9 = icmp sle i32 %cond, 0
+ br i1 %tmp9, label %bb9, label %bb7
+
+bb7: ; preds = %bb5
+ store i32 %tmp3, i32 addrspace(1)* %out
+ br label %bb9
+
+bb9: ; preds = %bb8, %bb4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/uniform-crash.ll b/test/CodeGen/AMDGPU/uniform-crash.ll
new file mode 100644
index 000000000000..cfbb2af58677
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}icmp_2_users:
+; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1
+; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
+; GCN: [[LABEL]]:
+; GCN-NEXT: s_endpgm
+define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+main_body:
+ %0 = icmp sgt i32 %cond, 0
+ %1 = sext i1 %0 to i32
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ store i32 %1, i32 addrspace(1)* %out
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ ret void
+}
+
+; GCN-LABEL: {{^}}fix_sgpr_live_ranges_crash:
+; GCN: s_cbranch_scc1 [[BB0:[A-Z0-9_]+]]
+; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
+; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN: {{^}}[[BB0]]:
+define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
+bb:
+ %cnd = trunc i32 %arg to i1
+ br i1 %cnd, label %bb2, label %bb5
+
+bb2: ; preds = %bb
+ %tmp = mul i32 10, %arg1
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb2
+ %val = load volatile i32, i32 addrspace(2)* undef
+ %tmp4 = icmp eq i32 %val, %arg1
+ br i1 %tmp4, label %bb5, label %bb3
+
+bb5: ; preds = %bb3, %bb
+ %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #1
+ %tmp10 = icmp ult i32 %tmp6, %arg
+ br i1 %tmp10, label %bb11, label %bb12
+
+bb11: ; preds = %bb11, %bb5
+ br i1 undef, label %bb11, label %bb12
+
+bb12: ; preds = %bb11, %bb5
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
new file mode 100644
index 000000000000..9f2f0d67d245
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
+
+; Test a simple uniform loop that lives inside non-uniform control flow.
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cmp_ne_i32_e32 vcc, 0
+; CHECK: s_and_saveexec_b64
+
+; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
+
+; CHECK: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
+main_body:
+ %cc = icmp eq i32 %p, 0
+ br i1 %cc, label %out, label %loop_body
+
+loop_body:
+ %counter = phi i32 [ 0, %main_body ], [ %incr, %loop_body ]
+
+ ; Prevent the loop from being optimized out
+ call void asm sideeffect "", "" ()
+
+ %incr = add i32 %counter, 1
+ %lc = icmp sge i32 %incr, 1000
+ br i1 %lc, label %out, label %loop_body
+
+out:
+ ret void
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: s_and_saveexec_b64
+;CHECK: s_xor_b64
+;CHECK-NEXT: s_cbranch_execz
+define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %cc = icmp eq i32 %tid, 0
+ br i1 %cc, label %done1, label %if
+
+if:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %done0, label %loop_body
+
+loop_body:
+ %counter = phi i32 [ 0, %if ], [ 0, %done0 ], [ %incr, %loop_body ]
+
+ ; Prevent the loop from being optimized out
+ call void asm sideeffect "", "" ()
+
+ %incr = add i32 %counter, 1
+ %lc = icmp sge i32 %incr, 1000
+ br i1 %lc, label %done1, label %loop_body
+
+done0:
+ %cmp0 = icmp eq i32 %b, 0
+ br i1 %cmp0, label %done1, label %loop_body
+
+done1:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
new file mode 100644
index 000000000000..941f4c601e34
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -march=r600 -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+
+; Should not crash when the processor is not recognized and the
+; wavefront size feature is not set.
+
+; Should also not mix fragments of r600 and GCN ISA.
+
+; ERROR: 'unknown' is not a recognized processor for this target (ignoring processor)
+
+; GCN-NOT: MOV
+; GCN: buffer_store_dword
+; GCN: ScratchSize: 8{{$}}
+
+; R600: MOV
+define void @foo() {
+ %alloca = alloca i32, align 4
+ store volatile i32 0, i32* %alloca
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 87b925a24a04..0435ed4d5525 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
declare float @llvm.fma.f32(float, float, float) #1
declare double @llvm.fma.f64(double, double, double) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
-declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
+declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
@@ -28,10 +28,10 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a)
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
; GCN: buffer_store_dword [[RESULT]]
@@ -68,10 +68,10 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
@@ -82,10 +82,10 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
@@ -107,7 +107,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
@@ -118,11 +118,11 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
- %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
- store i32 %fma, i32 addrspace(1)* %out, align 4
+define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+ %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
+ store float %val, float addrspace(1)* %out, align 4
ret void
}
@@ -227,7 +227,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out
; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
@@ -254,7 +254,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; Same zero component is re-used for half of each immediate.
; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
-; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT0]]
; GCN: buffer_store_dwordx2 [[RESULT1]]
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index c368c5aaf7dc..ca6bff4f6fc8 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; SI-LABEL: {{^}}v_cnd_nan_nosgpr:
; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; All nan values are converted to 0xffffffff
; SI: s_endpgm
define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
- %idx = call i32 @llvm.r600.read.tidig.x() #1
+ %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
%f = load float, float addrspace(1)* %fptr
%setcc = icmp ne i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index a4eaec3403c9..027c63817903 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -2,19 +2,19 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
-; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4
-; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
+; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) {
+define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
- %a = load float, float addrspace(1)* %in
- %b = load float, float addrspace(1)* %b_ptr
- %c = load float, float addrspace(1)* %c_ptr
+ %a = load volatile float, float addrspace(1)* %in
+ %b = load volatile float, float addrspace(1)* %b_ptr
+ %c = load volatile float, float addrspace(1)* %c_ptr
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %c
@@ -24,8 +24,8 @@ entry:
; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
-; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) {
+; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
+define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
%tmp0 = fmul float 0.5, %in
%tmp1 = fadd float %tmp0, 0.5
@@ -36,7 +36,7 @@ entry:
; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) {
+define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -51,7 +51,7 @@ entry:
; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
+define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
%c = load float, float addrspace(1)* %in
@@ -64,18 +64,18 @@ entry:
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) {
+define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
%d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
%e_ptr = getelementptr float, float addrspace(1)* %in, i32 4
- %a = load float, float addrspace(1)* %in
- %b = load float, float addrspace(1)* %b_ptr
- %c = load float, float addrspace(1)* %c_ptr
- %d = load float, float addrspace(1)* %d_ptr
- %e = load float, float addrspace(1)* %e_ptr
+ %a = load volatile float, float addrspace(1)* %in
+ %b = load volatile float, float addrspace(1)* %b_ptr
+ %c = load volatile float, float addrspace(1)* %c_ptr
+ %d = load volatile float, float addrspace(1)* %d_ptr
+ %e = load volatile float, float addrspace(1)* %e_ptr
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %c
@@ -104,6 +104,46 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
+ %neg_a = fsub float -0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}unsafe_mad_sub0_src0:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @unsafe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_a = fsub float 0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}safe_mad_sub0_src0:
+; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
+; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
+define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
%neg_a = fsub float 0.0, %a
%tmp0 = fmul float %neg_a, %b
%tmp1 = fadd float %tmp0, %c
@@ -124,6 +164,26 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
+ %neg_b = fsub float -0.0, %b
+ %tmp0 = fmul float %a, %neg_b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}unsafe_mad_sub0_src1:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @unsafe_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
%neg_b = fsub float 0.0, %b
%tmp0 = fmul float %a, %neg_b
%tmp1 = fadd float %tmp0, %c
@@ -144,7 +204,7 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
- %neg_c = fsub float 0.0, %c
+ %neg_c = fsub float -0.0, %c
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %neg_c
@@ -152,4 +212,5 @@ entry:
ret void
}
-attributes #0 = { "true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 1cbefba60c95..02a1278f76cb 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -1,15 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: @test_if
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
- switch i32 %a, label %default [
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ switch i32 %tid, label %default [
i32 0, label %case0
i32 1, label %case1
]
@@ -25,7 +26,7 @@ case1:
br label %end
default:
- %cmp8 = icmp eq i32 %a, 2
+ %cmp8 = icmp eq i32 %tid, 2
%arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
br i1 %cmp8, label %if, label %else
@@ -54,7 +55,7 @@ end:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
br i1 %is.0, label %store, label %exit
@@ -71,22 +72,22 @@ exit:
; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz BB2_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
-; SI: ; BB#1:
; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; SI: BB2_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_i32_e32 vcc,
-; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
-; SI: s_cbranch_execnz BB2_3
+; SI-DAG: s_and_b64 vcc, exec, vcc
+; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI: [[LABEL_EXIT]]:
+; SI: s_endpgm
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
%limit = add i32 %tid, 64
br i1 %is.0, label %loop, label %exit
@@ -114,7 +115,7 @@ exit:
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz BB3_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
; Initialize inner condition to false
; SI: ; BB#1:
@@ -122,7 +123,7 @@ exit:
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
; Clear exec bits for workitems that load -1s
-; SI: BB3_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
@@ -130,29 +131,29 @@ exit:
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
-; SI: s_cbranch_execz BB3_5
+; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
-; SI: BB#4:
+; SI: BB#3:
; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]]
-; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]]
+; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
+; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
-; SI: BB3_5:
+; SI: [[LABEL_FLOW]]:
; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]]
+; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz BB3_3
+; SI: s_cbranch_execnz [[LABEL_LOOP]]
-; SI: BB#6
+; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]
-; SI: BB3_2:
+; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
- %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp4 = sext i32 %tmp to i64
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
%tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index 6f3b4847fbdf..c151ca9ef9b4 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll
new file mode 100644
index 000000000000..2d39f82e2499
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that when extracting the same unknown vector index from an
+; insertelement, the dynamic indexing is folded away.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; No dynamic indexing required
+; GCN-LABEL: {{^}}extract_insert_same_dynelt_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
+ %extract = extractelement <4 x i32> %insert, i32 %idx
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
+; GCN: buffer_load_dwordx4
+; GCN: v_movreld_b32
+; GCN: v_movrels_b32
+; GCN: buffer_store_dword v
+define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx0
+ %extract = extractelement <4 x i32> %insert, i32 %idx1
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_same_elt2_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
+ %extract = extractelement <4 x i32> %insert, i32 %idx
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_same_dynelt_v4f32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %id.ext
+ %vec = load volatile <4 x float>, <4 x float> addrspace(1)* %gep.in
+ %insert = insertelement <4 x float> %vec, float %val, i32 %idx
+ %extract = extractelement <4 x float> %insert, i32 %idx
+ store float %extract, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index cd7c78f408dd..6c33bc98c605 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
-; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
; This ends up using all 256 registers and requires register
; scavenging, which will fail to find an unused register.
@@ -11,24 +11,33 @@
; FIXME: The same register is initialized to 0 for every spill.
-declare i32 @llvm.r600.read.tgid.x() #1
-declare i32 @llvm.r600.read.tgid.y() #1
-declare i32 @llvm.r600.read.tgid.z() #1
-
; GCN-LABEL: {{^}}spill_vgpr_compute:
-; GCN: s_mov_b32 s16, s3
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: workitem_private_segment_byte_size = 1024
+
+; GCN-NOT: flat_scr
+
+; GCNMESA-DAG: s_mov_b32 s16, s3
+; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s14, -1
+; SIMESA-DAG: s_mov_b32 s15, 0xe8f000
+; VIMESA-DAG: s_mov_b32 s15, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
@@ -175,7 +184,8 @@ bb12: ; preds = %bb145, %bb
%tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
%tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
%tmp142 = bitcast float %tmp95 to i32
- %tmp143 = icmp sgt i32 %tmp142, 125
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tmp143 = icmp sgt i32 %tmp142, %tid
br i1 %tmp143, label %bb144, label %bb145
bb144: ; preds = %bb12
@@ -581,5 +591,7 @@ bb145: ; preds = %bb12
br label %bb12
}
+declare i32 @llvm.r600.read.tidig.x() #1
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 16abb89bb0b8..7d97777a78bd 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -8,22 +8,25 @@
; intermediate register class copies.
; FIXME: The same register is initialized to 0 for every spill.
+; FIXME: The unused arguments are removed
; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; GCN-DAG: s_mov_b32 s13, s12
+; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s18, -1
+; SI-DAG: s_mov_b32 s19, 0xe8f000
+; VI-DAG: s_mov_b32 s19, 0xe80000
-; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; s13 is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
-define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0
%tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
@@ -172,7 +175,8 @@ bb24: ; preds = %bb157, %bb
%tmp152 = phi float [ 0.000000e+00, %bb ], [ %tmp417, %bb157 ]
%tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ]
%tmp154 = bitcast float %tmp107 to i32
- %tmp155 = icmp sgt i32 %tmp154, 125
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+ %tmp155 = icmp sgt i32 %tmp154, %tid
br i1 %tmp155, label %bb156, label %bb157
bb156: ; preds = %bb24
@@ -487,7 +491,9 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!0 = !{!1, !1, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
new file mode 100644
index 000000000000..ad7521a3da9b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -0,0 +1,24 @@
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+
+declare float @llvm.amdgcn.rsq.legacy(float) #0
+
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocation(line: 1, column: 42, scope: !5)
+!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll
index 2bfe1b2bd6ec..ae8ec58270c1 100644
--- a/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Test that we correctly commute a sub instruction
; FUNC-LABEL: {{^}}sub_rev:
@@ -10,7 +10,7 @@
define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
entry:
- %vgpr = call i32 @llvm.r600.read.tidig.x() #1
+ %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
%tmp = icmp eq i32 %cond, 0
br i1 %tmp, label %if, label %else
@@ -37,7 +37,7 @@ endif: ; preds = %else, %if
; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
define void @add_fold(float addrspace(1)* %out) {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x()
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = uitofp i32 %tmp to float
%tmp2 = fadd float %tmp1, 1.024000e+03
store float %tmp2, float addrspace(1)* %out
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index dc1f1ea11b01..0cd706b642d7 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -1,29 +1,29 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;FUNC-LABEL: {{^}}test_select_v2i32:
+; FUNC-LABEL: {{^}}test_select_v2i32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
entry:
- %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
- %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
- %cmp = icmp ne <2 x i32> %0, %1
- %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
+ %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
+ %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
+ %cmp = icmp sgt <2 x i32> %load0, %load1
+ %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_select_v2f32:
+; FUNC-LABEL: {{^}}test_select_v2f32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e32
@@ -40,24 +40,24 @@ entry:
;FUNC-LABEL: {{^}}test_select_v4i32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[4].X
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].W
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
; FIXME: The shrinking does not happen on tonga
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:
- %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
- %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
- %cmp = icmp ne <4 x i32> %0, %1
- %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
+ %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
+ %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
+ %cmp = icmp sgt <4 x i32> %load0, %load1
+ %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
index 107e84b33be9..265774180a7f 100644
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -11,14 +11,14 @@
; DEFAULT: exp
; DEFAULT: s_waitcnt lgkmcnt(0)
; DEFAULT: s_endpgm
-define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
+define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) {
main_body:
%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
%tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
%tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
%tmp12 = extractelement <4 x float> %tmp11, i32 0
%tmp13 = extractelement <4 x float> %tmp11, i32 1
- call void @llvm.AMDGPU.barrier.global() #1
+ call void @llvm.amdgcn.s.barrier() #1
%tmp14 = extractelement <4 x float> %tmp11, i32 2
; %tmp15 = extractelement <4 x float> %tmp11, i32 3
%tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
@@ -45,8 +45,8 @@ main_body:
; ILPMAX: s_waitcnt vmcnt(0)
; ILPMAX: s_endpgm
-define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
-byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
+byval, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
%11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
%12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
@@ -71,14 +71,13 @@ main_body:
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.global() #1
+declare void @llvm.amdgcn.s.barrier() #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll
new file mode 100644
index 000000000000..38dbf2794fc5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=GCN %s
+
+; If flat_store_dword and flat_load_dword use different registers for the data
+; operand, this test is not broken. It just means it is no longer testing
+; for the original bug.
+
+; GCN: {{^}}test:
+; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
+; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
+define void @test(i32 addrspace(1)* %out, i32 %in) {
+ store volatile i32 0, i32 addrspace(1)* %out
+ %val = load volatile i32, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
new file mode 100644
index 000000000000..23b0ffd5b3da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -0,0 +1,366 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+
+; Check that WQM isn't triggered by image load/store intrinsics.
+;
+;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_wqm
+define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+; Check that WQM is triggered by image samples and left untouched for loads...
+;
+;CHECK-LABEL: {{^}}test2:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK-NOT: exec
+;CHECK: _load_dword v0,
+define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+main_body:
+ %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
+ %c.3 = extractelement <4 x i32> %c.2, i32 0
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
+ %data = load float, float addrspace(1)* %gep
+ ret float %data
+}
+
+; ... but disabled for stores (and, in this simple case, not re-enabled).
+;
+;CHECK-LABEL: {{^}}test3:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK-NOT: exec
+;CHECK: .size test3
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %tex.2 = extractelement <4 x i32> %tex.1, i32 0
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
+ %wr = extractelement <4 x float> %tex, i32 1
+ store float %wr, float addrspace(1)* %gep
+ ret <4 x float> %tex
+}
+
+; Check that WQM is re-enabled when required.
+;
+;CHECK-LABEL: {{^}}test4:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
+main_body:
+ %c.1 = mul i32 %c, %d
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
+ store float %data, float addrspace(1)* %gep
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; Check a case of one branch of an if-else requiring WQM, the other requiring
+; exact.
+;
+; Note: In this particular case, the save-and-restore could be avoided if the
+; analysis understood that the two branches of the if-else are mutually
+; exclusive.
+;
+;CHECK-LABEL: {{^}}test_control_flow_0:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %ELSE
+;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: store
+;CHECK: s_mov_b64 exec, [[SAVED]]
+;CHECK: %IF
+;CHECK: image_sample
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+main_body:
+ %cmp = icmp eq i32 %z, 0
+ br i1 %cmp, label %IF, label %ELSE
+
+IF:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %tex, i32 0
+ br label %END
+
+ELSE:
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+ ret float %r
+}
+
+; Reverse branch order compared to the previous test.
+;
+;CHECK-LABEL: {{^}}test_control_flow_1:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %IF
+;CHECK: image_sample
+;CHECK: %Flow
+;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
+;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
+;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
+;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+main_body:
+ %cmp = icmp eq i32 %z, 0
+ br i1 %cmp, label %ELSE, label %IF
+
+IF:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %tex, i32 0
+ br label %END
+
+ELSE:
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+ ret float %r
+}
+
+; Check that branch conditions are properly marked as needing WQM...
+;
+;CHECK-LABEL: {{^}}test_control_flow_2:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: load
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_cmp
+define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+main_body:
+ %idx.1 = extractelement <3 x i32> %idx, i32 0
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 0
+ store float %data.1, float addrspace(1)* %gep.1
+
+ ; The load that determines the branch (and should therefore be executed in
+ ; WQM) is surrounded by stores that require WQM to be disabled.
+ %idx.2 = extractelement <3 x i32> %idx, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
+ %z = load float, float addrspace(1)* %gep.2
+
+ %idx.3 = extractelement <3 x i32> %idx, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
+ %data.3 = extractelement <2 x float> %data, i32 1
+ store float %data.3, float addrspace(1)* %gep.3
+
+ %cc = fcmp ogt float %z, 0.0
+ br i1 %cc, label %IF, label %ELSE
+
+IF:
+ %coord.IF = mul i32 %coord, 3
+ br label %END
+
+ELSE:
+ %coord.ELSE = mul i32 %coord, 4
+ br label %END
+
+END:
+ %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; ... but only if they really do need it.
+;
+;CHECK-LABEL: {{^}}test_control_flow_3:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: load
+;CHECK: store
+;CHECK: v_cmp
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex.1 = extractelement <4 x float> %tex, i32 0
+
+ %idx.1 = extractelement <3 x i32> %idx, i32 0
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 0
+ store float %data.1, float addrspace(1)* %gep.1
+
+ %idx.2 = extractelement <3 x i32> %idx, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
+ %z = load float, float addrspace(1)* %gep.2
+
+ %idx.3 = extractelement <3 x i32> %idx, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
+ %data.3 = extractelement <2 x float> %data, i32 1
+ store float %data.3, float addrspace(1)* %gep.3
+
+ %cc = fcmp ogt float %z, 0.0
+ br i1 %cc, label %IF, label %ELSE
+
+IF:
+ %tex.IF = fmul float %tex.1, 3.0
+ br label %END
+
+ELSE:
+ %tex.ELSE = fmul float %tex.1, 4.0
+ br label %END
+
+END:
+ %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
+ ret float %tex.END
+}
+
+; Another test that failed at some point because of terminator handling.
+;
+;CHECK-LABEL: {{^}}test_control_flow_4:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %IF
+;CHECK: load
+;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: store
+;CHECK: s_mov_b64 exec, [[SAVE]]
+;CHECK: %END
+;CHECK: image_sample
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
+main_body:
+ %cond = icmp eq i32 %y, 0
+ br i1 %cond, label %IF, label %END
+
+IF:
+ %data = load float, float addrspace(1)* %ptr
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; Kill is performed in WQM mode so that uniform kill behaves correctly ...
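+; (The kill lowers to the v_cmpx_ checked below, which writes exec directly,
+; so it has to be handled like the other points where exec is rewritten.)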
+;
+;CHECK-LABEL: {{^}}test_kill_0:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;SI: buffer_store_dword
+;VI: flat_store_dword
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_cmpx_
+;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;SI: buffer_store_dword
+;VI: flat_store_dword
+;CHECK: s_mov_b64 exec, [[SAVE]]
+;CHECK: image_sample
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %idx.0 = extractelement <2 x i32> %idx, i32 0
+ %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
+ %data.0 = extractelement <2 x float> %data, i32 0
+ store float %data.0, float addrspace(1)* %gep.0
+
+ call void @llvm.AMDGPU.kill(float %z)
+
+ %idx.1 = extractelement <2 x i32> %idx, i32 1
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 1
+ store float %data.1, float addrspace(1)* %gep.1
+
+ %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %out = fadd <4 x float> %tex, %tex2
+
+ ret <4 x float> %out
+}
+
+; ... but only if WQM is necessary.
+;
+; CHECK-LABEL: {{^}}test_kill_1:
+; CHECK-NEXT: ; %main_body
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; SI: buffer_store_dword
+; VI: flat_store_dword
+; CHECK-NOT: wqm
+; CHECK: v_cmpx_
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+ store float %data, float addrspace(1)* %gep
+
+ call void @llvm.AMDGPU.kill(float %z)
+
+ ret <4 x float> %tex
+}
+
+; Check prolog shaders.
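+; test_prolog_1 contains no image samples; WQM is requested only by the
+; "amdgpu-ps-wqm-outputs" attribute (attributes #4 below), which, going by
+; its name, marks functions whose outputs must be computed in whole quad mode.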
+;
+; CHECK-LABEL: {{^}}test_prolog_1:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_add_f32_e32 v0,
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+main_body:
+ %s = fadd float %a, %b
+ ret float %s
+}
+
+declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+
+declare void @llvm.AMDGPU.kill(float)
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind readnone }
+attributes #4 = { "amdgpu-ps-wqm-outputs" }
diff --git a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
new file mode 100644
index 000000000000..deac809f9b05
--- /dev/null
+++ b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -0,0 +1,22 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+
+; write_register doesn't prevent us from illegally trying to write a
+; vgpr value into a scalar register, but I don't think there's much we
+; can do to avoid this.
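+; llvm.amdgcn.workitem.id.x yields a per-lane (VGPR) value while !0 below
+; names exec_lo, a scalar register, so the resulting copy is illegal; the
+; test simply expects llc with -verify-machineinstrs to fail (XFAIL) in an
+; asserts build.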
+
+declare void @llvm.write_register.i32(metadata, i32) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+
+define void @write_vgpr_into_sgpr() {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ call void @llvm.write_register.i32(metadata !0, i32 %tid)
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{!"exec_lo"}
diff --git a/test/CodeGen/AMDGPU/write_register.ll b/test/CodeGen/AMDGPU/write_register.ll
new file mode 100644
index 000000000000..88660ba6ec6a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/write_register.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+
+declare void @llvm.write_register.i32(metadata, i32) #0
+declare void @llvm.write_register.i64(metadata, i64) #0
+
+; CHECK-LABEL: {{^}}test_write_m0:
+define void @test_write_m0(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !0, i32 0)
+ call void @llvm.write_register.i32(metadata !0, i32 -1)
+ call void @llvm.write_register.i32(metadata !0, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec:
+; CHECK: s_mov_b64 exec, 0
+; CHECK: s_mov_b64 exec, -1
+; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_write_exec(i64 %val) #0 {
+ call void @llvm.write_register.i64(metadata !1, i64 0)
+ call void @llvm.write_register.i64(metadata !1, i64 -1)
+ call void @llvm.write_register.i64(metadata !1, i64 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch:
+; CHECK: s_mov_b64 flat_scratch, 0
+; CHECK: s_mov_b64 flat_scratch, -1
+; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_write_flat_scratch(i64 %val) #0 {
+ call void @llvm.write_register.i64(metadata !2, i64 0)
+ call void @llvm.write_register.i64(metadata !2, i64 -1)
+ call void @llvm.write_register.i64(metadata !2, i64 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
+; CHECK: s_mov_b32 flat_scratch_lo, 0
+; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
+define void @test_write_flat_scratch_lo(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !3, i32 0)
+ call void @llvm.write_register.i32(metadata !3, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
+; CHECK: s_mov_b32 flat_scratch_hi, 0
+; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
+define void @test_write_flat_scratch_hi(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !4, i32 0)
+ call void @llvm.write_register.i32(metadata !4, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec_lo:
+; CHECK: s_mov_b32 exec_lo, 0
+; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
+define void @test_write_exec_lo(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !5, i32 0)
+ call void @llvm.write_register.i32(metadata !5, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec_hi:
+; CHECK: s_mov_b32 exec_hi, 0
+; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
+define void @test_write_exec_hi(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !6, i32 0)
+ call void @llvm.write_register.i32(metadata !6, i32 %val)
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"m0"}
+!1 = !{!"exec"}
+!2 = !{!"flat_scratch"}
+!3 = !{!"flat_scratch_lo"}
+!4 = !{!"flat_scratch_hi"}
+!5 = !{!"exec_lo"}
+!6 = !{!"exec_hi"}
diff --git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 8b383e4c393d..7f6b80459047 100644
--- a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -71,7 +71,7 @@ declare i32 @llvm.r600.read.global.size.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.global.size.z() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!opencl.kernels = !{!0, !1, !2}
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 655655d92f08..202170d6e229 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -64,8 +64,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
- %a = load i1, i1 addrspace(1)* %in0
- %b = load i1, i1 addrspace(1)* %in1
+ %a = load volatile i1, i1 addrspace(1)* %in0
+ %b = load volatile i1, i1 addrspace(1)* %in1
%xor = xor i1 %a, %b
store i1 %xor, i1 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
index 35ddf2b0a465..c3b76da5f778 100644
--- a/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
; R600: {{^}}test:
; R600: MEM_RAT_CACHELESS STORE_RAW
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
new file mode 100644
index 000000000000..842c30b40df2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
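+; The zext guarantees the high 32 bits of %ext are zero, so only the low
+; halves need a v_or_b32; no or of the high half and no zero-materializing
+; v_mov_b32 should be emitted, which the GCN-NOT lines below enforce.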
+; GCN-LABEL: {{^}}zext_or_operand_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ld.64, %ext
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}zext_or_operand_commute_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: v[[HI]]
+; GCN-NOT: _or_
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ext, %ld.64
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}