aboutsummaryrefslogtreecommitdiff
path: root/test/CodeGen/AMDGPU/min.ll
diff options
context:
space:
mode:
Diffstat (limited to 'test/CodeGen/AMDGPU/min.ll')
-rw-r--r--test/CodeGen/AMDGPU/min.ll303
1 files changed, 242 insertions, 61 deletions
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 19d0117d64a9..e85a724c1567 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,17 +1,22 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
@@ -19,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -30,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -47,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %val, <4 x i32> addrspace(1)* %out
@@ -60,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: s_min_i32
-define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, i8 addrspace(1)* %out
@@ -90,30 +95,62 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
; VI: v_min_i32
; VI: v_min_i32
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+
; GCN: s_endpgm
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %cmp = icmp sle <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: VI use s_min_i32
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+; GFX9: v_pk_min_i16
+
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %val, <4 x i16> addrspace(1)* %out
@@ -124,12 +161,36 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_imin_slt_i16
+; SI: v_min_i32_e32
+
+; GFX89: v_min_i16_e32
+
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+
+ %a = load i16, i16 addrspace(1)* %a.gep
+ %b = load i16, i16 addrspace(1)* %b.gep
+ %cmp = icmp slt i16 %a, %b
+ %val = select i1 %cmp, i16 %a, i16 %b
+ store i16 %val, i16 addrspace(1)* %out.gep
ret void
}
@@ -137,7 +198,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -150,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %val, <2 x i32> addrspace(1)* %out
@@ -161,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
%cmp = icmp slt i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -172,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
%cmp = icmp sle i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -183,12 +244,16 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN: v_min_u32_e32
; EG: MIN_UINT
-define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
@@ -196,25 +261,65 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
-; SI-NOT: v_min_u32_e32
+; GCN-NOT: v_min_u32_e32
; GCN: s_endpgm
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
- %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr
- %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr
+define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+
+ %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
+ %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
%cmp = icmp ule <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
- store <3 x i32> %val, <3 x i32> addrspace(1)* %out
+ store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Reduce unused packed component to scalar
+; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI-NOT: v_min_u32_e32
+
+; VI: v_min_u16_e32
+; VI: v_min_u16_sdwa
+; VI: v_min_u16_e32
+; VI-NOT: v_min_u16_e32
+
+; GFX9: v_pk_min_u16
+; GFX9: v_pk_min_u16
+
+; GCN: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+
+ %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
+ %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
+ %cmp = icmp ule <3 x i16> %a, %b
+ %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+ store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
ret void
}
+
; FUNC-LABEL: @s_test_umin_ule_i32
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -225,27 +330,40 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: v_min_u32_e32
; EG: MIN_UINT
-define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: v_min_u32_e32
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_min_u32_e32
+
+; GFX89: flat_load_ubyte
+; GFX89: flat_load_ubyte
+; GFX89: v_min_u16_e32
; EG: MIN_UINT
-define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %a = load i8, i8 addrspace(1)* %aptr, align 1
- %b = load i8, i8 addrspace(1)* %bptr, align 1
+define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid
+
+ %a = load i8, i8 addrspace(1)* %a.gep, align 1
+ %b = load i8, i8 addrspace(1)* %b.gep, align 1
%cmp = icmp ult i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %out, align 1
+ store i8 %val, i8 addrspace(1)* %out.gep, align 1
ret void
}
@@ -253,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -268,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: s_endpgm
; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
@@ -286,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace
; GCN: s_endpgm
; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
%a = load i16, i16 addrspace(1)* %aptr, align 2
%b = load i16, i16 addrspace(1)* %bptr, align 2
%cmp = icmp ult i16 %a, %b
@@ -301,7 +419,7 @@ define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%cmp = icmp ult <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -326,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
%cmp = icmp ult <8 x i32> %a, %b
%val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
store <8 x i32> %val, <8 x i32> addrspace(1)* %out
@@ -334,14 +452,23 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
; EG: MIN_UINT
; EG: MIN_UINT
@@ -351,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
%cmp = icmp ult <8 x i16> %a, %b
%val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
store <8 x i16> %val, <8 x i16> addrspace(1)* %out
@@ -367,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_UINT
-define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
%cmp = icmp ult i32 %a.ext, %b.ext
@@ -387,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_INT
-define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp slt i32 %a.ext, %b.ext
@@ -402,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
%cmp = icmp sle i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
@@ -415,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
; EG: MIN_UINT
; EG: MIN_UINT
-define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp ult i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -427,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG: MIN_UINT
; EG: MIN_UINT
-define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp ule i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -439,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
-define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp slt i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -451,9 +578,63 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
-define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp sle i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
+
+; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+ %cmp = icmp sle <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: i16 min
+; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16:
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_pk_min_u16
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+ %cmp = icmp ule <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }