diff options
Diffstat (limited to 'test/CodeGen/AMDGPU')
189 files changed, 2671 insertions, 1681 deletions
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll index bee13d8c17f1..98848295a73b 100644 --- a/test/CodeGen/AMDGPU/add.i16.ll +++ b/test/CodeGen/AMDGPU/add.i16.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -67,7 +67,7 @@ define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] +; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { @@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll index 7e4546d2cfb3..6dcd7c234dc6 100644 --- a/test/CodeGen/AMDGPU/add.ll +++ b/test/CodeGen/AMDGPU/add.ll @@ -5,9 +5,9 @@ ;FUNC-LABEL: {{^}}test1: ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} -;SI-NOT: [[REG]] -;SI: buffer_store_dword [[REG]], +;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}} +;SI: v_mov_b32_e32 v[[REG]], s[[REG]] +;SI: buffer_store_dword v[[REG]], define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -21,8 +21,8 @@ define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -39,10 +39,10 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index 76f724c2b90b..4baa35ca57c5 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -168,10 +168,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll index 00a125c2e44f..d33965d4dda7 100644 --- a/test/CodeGen/AMDGPU/add_i128.ll +++ b/test/CodeGen/AMDGPU/add_i128.ll @@ -19,10 +19,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 ; Check that the SGPR add operand is correctly moved to a VGPR. ; GCN-LABEL: {{^}}sgpr_operand: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a @@ -31,10 +31,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad } ; GCN-LABEL: {{^}}sgpr_operand_reversed: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll index 62733d5bfb6c..f673d91192b8 100644 --- a/test/CodeGen/AMDGPU/add_i64.ll +++ b/test/CodeGen/AMDGPU/add_i64.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add ; Check that the SGPR add operand is correctly moved to a VGPR. ; SI-LABEL: {{^}}sgpr_operand: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a @@ -32,8 +32,8 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr ; SGPR as other operand. ; ; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index b1e71722d80c..a6aa9e795151 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -10,20 +10,22 @@ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base - -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] @@ -48,6 +50,12 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] + ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 @@ -55,12 +63,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] diff --git a/test/CodeGen/AMDGPU/alignbit-pat.ll b/test/CodeGen/AMDGPU/alignbit-pat.ll index ff5c8960fad3..3f07188063cd 100644 --- a/test/CodeGen/AMDGPU/alignbit-pat.ll +++ b/test/CodeGen/AMDGPU/alignbit-pat.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}alignbit_shr_pat: ; GCN-DAG: s_load_dword s[[SHR:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index 0e5605961e10..0c7160df2b96 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 -; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 +; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 ; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} - -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x @@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out ; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}} - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x @@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace ; FIXME: Should be able to get fdiv for 1.0 component ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 ; CHECK: store volatile <2 x float> %arcp.25ulp -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 @@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 +; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll index 2aec03aff8a3..ef11ae87267e 100644 --- a/test/CodeGen/AMDGPU/and-gcn.ll +++ b/test/CodeGen/AMDGPU/and-gcn.ll @@ -2,8 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 +; SI: s_and_b64 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index c356f8b87cfc..ee0190149e92 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -26,10 +26,11 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -136,7 +137,9 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrs ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -145,7 +148,9 @@ define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -154,7 +159,9 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 a ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -239,8 +246,11 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out ; SI: v_and_b32 ; SI: v_and_b32 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 + %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %b = load i64, i64 addrspace(1)* %gep.b, align 8 %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -251,7 +261,9 @@ define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -299,26 +311,30 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out } ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant: -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_and_inline_imm_i64: -; SI: buffer_load_dword v{{[0-9]+}} +; SI: {{buffer|flat}}_load_dword v{{[0-9]+}} ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -326,13 +342,15 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr ; FIXME: Should be able to reduce load width ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; SI-NOT: and ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -549,5 +567,4 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1 store i64 %and, i64 addrspace(1)* %out, align 8 ret void } - attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index c61c23222bc7..cdc60ab504e0 100644 --- a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32: -; GCN: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dword +; GCN: s_load_dwordx4 +; GCN-DAG: s_load_dwordx4 +; GCN-DAG: s_load_dword ; GCN: {{buffer|flat}}_store_byte ; GCN: {{buffer|flat}}_store_byte diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll index 539373f7bdeb..f29bfb46b94b 100644 --- a/test/CodeGen/AMDGPU/bitreverse.ll +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +declare i32 @llvm.amdgcn.workitem.id.x() #1 + declare i16 @llvm.bitreverse.i16(i16) #1 declare i32 @llvm.bitreverse.i32(i32) #1 declare i64 @llvm.bitreverse.i64(i64) #1 @@ -42,12 +44,14 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) } ; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -66,7 +70,9 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -82,7 +88,9 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { - %val = load i64, i64 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid + %val = load i64, i64 addrspace(1)* %gep %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -97,7 +105,9 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FUNC-LABEL: {{^}}v_brev_v2i64: define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll index d2dacd7c17b3..eb3fc2fab34f 100644 --- a/test/CodeGen/AMDGPU/bswap.ll +++ b/test/CodeGen/AMDGPU/bswap.ll @@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: s_load_dword [[VAL:s[0-9]+]] ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 5dec3e35ab3d..c114332a5887 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; OPT-LABEL: @test_no_sink_flat_small_offset_i32( ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in @@ -40,7 +40,7 @@ done: ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT: getelementptr i32, i32 addrspace(4)* %out, -; OPT-CI-NOT: getelementptr +; rOPT-CI-NOT: getelementptr ; OPT: br i1 ; OPT-CI: addrspacecast diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index c1cf56e5058e..c01d834bc33d 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index 6ecf75c1acec..90fba0342090 100644 --- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -1,36 +1,4 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s ---- | - define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_max_reg_imm_f32() #0 { - ret void - } - - attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } - -... --- # GCN-LABEL: name: v_max_self_clamp_not_set_f32 # GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec @@ -70,7 +38,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -132,7 +100,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -195,7 +163,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -260,7 +228,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -337,7 +305,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -402,7 +370,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -435,7 +403,7 @@ registers: - { id: 0, class: vgpr_32 } - { id: 1, class: vgpr_32 } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %vgpr0 %0 = COPY %vgpr0 diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll index 3e1b76a1df09..14b798ba822b 100644 --- a/test/CodeGen/AMDGPU/coalescer_remat.ll +++ b/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 8 +; CHECK: ; NumVgprs: 4 define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index ed78ccc9b617..0401f7b07e21 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -1,84 +1,5 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s ---- | - define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %and = and i32 %a, 1234567 - store volatile i32 %and, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %shl = shl i32 %a, 12 - store volatile i32 %shl, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %shl = shl i32 %a, 12 - store i32 %shl, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %ashr = ashr i32 %a, 12 - store volatile i32 %ashr, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %ashr = ashr i32 %a, 12 - store i32 %ashr, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %lshr = lshr i32 %a, 12 - store volatile i32 %lshr, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %lshr = lshr i32 %a, 12 - store i32 %lshr, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @undefined_vreg_operand() { - unreachable - } - - declare i32 @llvm.amdgcn.workitem.id.x() #1 - - attributes #0 = { nounwind } - attributes #1 = { nounwind readnone } - ... ---- # GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}} # GCN: %10 = V_MOV_B32_e32 1543, implicit %exec @@ -119,11 +40,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 %2 = COPY %1.sub1 %3 = COPY %1.sub0 %4 = S_MOV_B32 61440 @@ -133,7 +54,7 @@ body: | %8 = S_MOV_B32 9999 %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc %10 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -204,12 +125,12 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec %32 = REG_SEQUENCE %3, 1, %31, 2 %33 = V_LSHLREV_B64 2, killed %32, implicit %exec @@ -223,19 +144,19 @@ body: | %34 = V_MOV_B32_e32 63, implicit %exec %27 = V_AND_B32_e64 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_AND_B32_e64 %24, %26, implicit %exec - FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr %29 = V_AND_B32_e32 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr %30 = V_AND_B32_e64 %26, %26, implicit %exec - FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr %31 = V_AND_B32_e64 %34, %34, implicit %exec - FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -285,11 +206,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 1 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -298,7 +219,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -390,7 +311,7 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %2 = COPY %vgpr0 @@ -411,34 +332,34 @@ body: | %27 = S_MOV_B32 -4 %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr %13 = V_LSHL_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr %14 = V_LSHL_B32_e64 12, %7, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr %15 = V_LSHL_B32_e64 12, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr %22 = V_LSHL_B32_e64 %6, 12, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr %23 = V_LSHL_B32_e64 %6, 32, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr %25 = V_LSHL_B32_e32 %6, %6, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_LSHL_B32_e32 %27, %6, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -485,11 +406,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 999123 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -498,7 +419,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -593,12 +514,12 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %2 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec %16 = REG_SEQUENCE %2, 1, %15, 2 %17 = V_LSHLREV_B64 2, killed %16, implicit %exec @@ -619,34 +540,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr %13 = V_ASHR_I32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr %14 = V_ASHR_I32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr %15 = V_ASHR_I32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr %22 = V_ASHR_I32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr %23 = V_ASHR_I32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr %25 = V_ASHR_I32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_ASHR_I32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -693,11 +614,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 -999123 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -706,7 +627,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -802,12 +723,12 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %2 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec %16 = REG_SEQUENCE %2, 1, %15, 2 %17 = V_LSHLREV_B64 2, killed %16, implicit %exec @@ -828,34 +749,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr %13 = V_LSHR_B32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr %14 = V_LSHR_B32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr %15 = V_LSHR_B32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr %22 = V_LSHR_B32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr %23 = V_LSHR_B32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr %25 = V_LSHR_B32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_LSHR_B32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 8611cd080e15..09d4b2c8bd77 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] -; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] +; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll index d772d1b67936..e39bd60a1cc8 100644 --- a/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; FUNC-LABEL: {{^}}test_copy_v4i8: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 @@ -41,14 +47,16 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x } ; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 @@ -57,7 +65,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x } ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN-DAG: v_lshrrev_b32 ; GCN: v_and_b32 ; GCN: v_or_b32 @@ -66,7 +74,9 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 @@ -97,19 +107,21 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o } ; FUNC-LABEL: {{^}}test_copy_v3i8_align4: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { - %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}test_copy_v3i8_align2: -; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm @@ -120,9 +132,9 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}test_copy_v3i8_align1: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte @@ -135,10 +147,10 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_dword ; GCN: s_endpgm define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { @@ -148,10 +160,10 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* % } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte +; GCN: {{buffer|flat}}_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: buffer_store_byte diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll index 149c50685b1d..a544cbe890b5 100644 --- a/test/CodeGen/AMDGPU/ctlz.ll +++ b/test/CodeGen/AMDGPU/ctlz.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone @@ -34,9 +34,9 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) } ; FUNC-LABEL: {{^}}v_ctlz_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], +; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] +; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -44,14 +44,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) ; EG: FFBH_UINT ; EG: CNDE_INT define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctlz_v2i32: -; GCN: buffer_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: buffer_store_dwordx2 @@ -62,14 +64,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp ; EG: FFBH_UINT ; EG: CNDE_INT define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_ctlz_v4i32: -; GCN: buffer_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 @@ -90,16 +94,25 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; EG-DAG: FFBH_UINT ; EG-DAG: CNDE_INT define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 ret void } ; FUNC-LABEL: {{^}}v_ctlz_i8: -; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], -; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], +; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]] +; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]] + +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc + +; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]] +; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]] ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { @@ -136,12 +149,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; FUNC-LABEL: {{^}}v_ctlz_i64: ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] +; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] -; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] +; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc +; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]] ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}} @@ -168,12 +181,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm - define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -182,12 +197,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -197,13 +214,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out ; TODO: Should be able to eliminate select here as well. ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %ctlz, 32 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -212,13 +231,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %ctlz, 32 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -242,7 +263,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias } ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1: -; SI: buffer_load_ushort [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]], ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; SI: buffer_store_short [[FFBH]], define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 48f3e4401f1a..7500da536307 100644 --- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,21 +29,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: -; GCN: buffer_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: buffer_store_dwordx2 @@ -52,14 +54,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: -; GCN: buffer_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 ; GCN: v_ffbh_u32_e32 @@ -72,18 +76,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 ret void } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8: -; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { - %val = load i8, i8 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid + %val = load i8, i8 addrspace(1)* %in.gep %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out ret void @@ -116,11 +124,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64: ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] +; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] +; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}} define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() @@ -145,11 +153,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -158,11 +168,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 -1 @@ -186,15 +198,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]] ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]] ; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc ; GCN-DAG: buffer_store_dword [[RESULT0]] ; GCN-DAG: buffer_store_byte [[RESULT1]] ; GCN: s_endpgm - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 -1, i32 %ctlz @@ -205,13 +219,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 %sel = select i1 %cmp, i32 0, i32 %ctlz @@ -221,13 +237,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 %sel = select i1 %cmp, i32 %ctlz, i32 0 @@ -237,13 +255,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal ; Compare on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 1 %sel = select i1 %cmp, i32 0, i32 %ctlz @@ -253,13 +273,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal ; Selected on wrong constant ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0: -; GCN: buffer_load_dword +; GCN: {{buffer|flat}}_load_dword ; GCN: v_ffbh_u32_e32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 1 %sel = select i1 %cmp, i32 %ctlz, i32 0 diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index aa913ad406d2..68b39bad2bc1 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -8,6 +8,8 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + ; FUNC-LABEL: {{^}}s_ctpop_i32: ; GCN: s_load_dword [[SVAL:s[0-9]+]], ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] @@ -24,22 +26,24 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) ; XXX - Why 0 in register? ; FUNC-LABEL: {{^}}v_ctpop_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: -; GCN: buffer_load_dword [[VAL1:v[0-9]+]], -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] @@ -49,8 +53,11 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %val1 = load i32, i32 addrspace(1)* %in1, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid + %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid + %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4 + %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone %add = add i32 %ctpop0, %ctpop1 @@ -59,15 +66,17 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %add = add i32 %ctpop0, %sval +define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, %sval store i32 %add, i32 addrspace(1)* %out, align 4 ret void } @@ -80,7 +89,9 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -98,7 +109,9 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, < ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -124,7 +137,9 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, < ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid + %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -166,21 +181,25 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, < ; EG: BCNT_INT ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid + %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 ret void } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 4 store i32 %add, i32 addrspace(1)* %out, align 4 @@ -188,14 +207,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 4, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 @@ -203,14 +224,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 99999 store i32 %add, i32 addrspace(1)* %out, align 4 @@ -218,7 +241,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], @@ -226,7 +249,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, %const store i32 %add, i32 addrspace(1)* %out, align 4 @@ -234,7 +259,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], @@ -242,7 +267,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %const, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 @@ -250,18 +277,22 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou } ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}} -; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] +; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] +; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 + %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid %const = load i32, i32 addrspace(1)* %gep, align 4 %add = add i32 %const, %ctpop store i32 %add, i32 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index f18bd9fd8174..4850370851f6 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + declare i64 @llvm.ctpop.i64(i64) nounwind readnone declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone @@ -25,14 +27,16 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) } ; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -40,7 +44,7 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs } ; FUNC-LABEL: {{^}}v_ctpop_i64_user: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] @@ -49,7 +53,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %in.gep, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %or = or i64 %ctpop, %s.val store i64 %or, i64 addrspace(1)* %out @@ -87,7 +93,9 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, < ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -105,7 +113,9 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, < ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -169,7 +179,8 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] @@ -182,7 +193,9 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { - %val = load i128, i128 addrspace(1)* %in, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid + %val = load i128, i128 addrspace(1)* %in.gep, align 8 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 1fa6407647eb..1bfd38d94bfd 100644 --- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -5,6 +5,7 @@ declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: ; SI: s_load_dword [[VAL:s[0-9]+]], @@ -21,21 +22,23 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %in.gep, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 ret void } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 +; SI: {{buffer|flat}}_load_dwordx2 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 ; SI: buffer_store_dwordx2 @@ -44,14 +47,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 +; SI: {{buffer|flat}}_load_dwordx4 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 ; SI: v_ffbl_b32_e32 @@ -64,7 +69,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %tid = call i32 @llvm.r600.read.tidig.x() + %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 ret void diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 0328ce31002d..f839129fc3d8 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-LABEL: {{^}}load_i8_to_f32: -; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]], ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %load = load i8, i8 addrspace(1)* %gep, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}load_v2i8_to_v2f32: -; GCN: buffer_load_ushort [[LD:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]] ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid + %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v3i8_to_v3f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: v_cvt_f32_ubyte3_e32 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid + %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v4i8_to_v4f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]] ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] @@ -53,7 +59,9 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 ret void @@ -64,10 +72,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias ; FIXME: Packing bytes ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: -; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]] ; GCN-DAG: v_lshlrev_b32 ; GCN-DAG: v_or_b32 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], @@ -77,7 +85,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 ret void @@ -124,14 +134,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; GCN-LABEL: {{^}}load_v7i8_to_v7f32: ; GCN: s_endpgm define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid + %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}load_v8i8_to_v8f32: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, ; GCN-NOT: bfe ; GCN-NOT: lshr ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] @@ -147,19 +159,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid + %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]], ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] ; GCN: buffer_store_dword [[CONV]], define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep, align 4 %add = add i32 %load, 2 %inreg = and i32 %add, 255 %cvt = uitofp i32 %inreg to float @@ -169,7 +185,9 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep, align 4 %inreg = and i32 %load, 65280 %shr = lshr i32 %inreg, 8 %cvt = uitofp i32 %shr to float @@ -181,7 +199,9 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias ; them so it shouldn't really matter. ; GCN-LABEL: {{^}}i8_zext_i32_to_f32: define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid + %load = load i8, i8 addrspace(1)* %gep, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float store float %cvt, float addrspace(1)* %out, align 4 @@ -190,7 +210,9 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 @@ -198,12 +220,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no } ; GCN-LABEL: {{^}}extract_byte0_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %and = and i32 %val, 255 %cvt = uitofp i32 %and to float store float %cvt, float addrspace(1)* %out @@ -211,12 +235,14 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out } ; GCN-LABEL: {{^}}extract_byte1_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float @@ -225,12 +251,14 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out } ; GCN-LABEL: {{^}}extract_byte2_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float @@ -239,12 +267,14 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out } ; GCN-LABEL: {{^}}extract_byte3_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 %cvt = uitofp i32 %and to float diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir index 3148b9b8ff9d..c265b8e2ad2e 100644 --- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -1,14 +1,4 @@ # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s ---- | - define amdgpu_kernel void @test0() { ret void } - define amdgpu_kernel void @test1() { ret void } - define amdgpu_kernel void @test2() { ret void } - define amdgpu_kernel void @test3() { ret void } - define amdgpu_kernel void @test4() { ret void } - define amdgpu_kernel void @test5() { ret void } - define amdgpu_kernel void @loop0() { ret void } - define amdgpu_kernel void @loop1() { ret void } - define amdgpu_kernel void @loop2() { ret void } ... --- # Combined use/def transfer check, the basics. diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index 2c474dbe7b08..deb90df99dcf 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -9,7 +9,7 @@ ; SI-LABEL: @simple_read2_f32 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { ; SI-LABEL: @simple_read2_f32_max_offset ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 3dfdaf3936a6..ef4efc6336ce 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -38,9 +38,9 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]] -; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]] +; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { @@ -64,8 +64,8 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1) ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}} -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index 81b35a46aa18..b1fba8c240d7 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -7,7 +7,7 @@ ; SI-LABEL: @simple_read2st64_f32_0_1 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { @@ -26,7 +26,7 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 ; SI-LABEL: @simple_read2st64_f32_1_2 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { @@ -46,7 +46,7 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl ; SI-LABEL: @simple_read2st64_f32_max_offset ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll index ace01593808b..74404989f8c7 100644 --- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -1,4 +1,4 @@ -; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Most of these cases that don't trigger because of broken cost ; heuristics. Should not need -stress-early-ifcvt diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll index 9439130deb9e..792f0b1eaef4 100644 --- a/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/test/CodeGen/AMDGPU/early-if-convert.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: This leaves behind a now unnecessary and with exec diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll index 6eb1fc1d0cc2..b7dfcd99029a 100644 --- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll +++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -2,16 +2,21 @@ ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Test that the -enable-no-signed-zeros-fp-math flag works ; GCN-LABEL: {{^}}fneg_fsub_f32: -; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] ; GCN-UNSAFE-NOT: xor define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %tid, 1 + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add + %a = load float, float addrspace(1)* %gep, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 %result = fsub float %a, %b %neg.result = fsub float -0.0, %result diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 34999fa3aea4..3fb452de1ccf 100644 --- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Make sure the add and load are reduced to 32-bits even with the ; bitcast to vector. ; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0: @@ -8,7 +10,9 @@ ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -21,7 +25,9 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou ; GCN: v_add_f64 ; GCN: buffer_store_dword v define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { - %a = load double, double addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %a = load double, double addrspace(1)* %gep %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -34,7 +40,9 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out ; GCN: v_add_i32 ; GCN: buffer_store_dword define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> %extract = extractelement <2 x float> %val.bc, i32 0 diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll index 4e2ec4b3054f..d56d5ec1411a 100644 --- a/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { ; VI: flat_load_ushort [[HI:v[0-9]+]] ; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]] +; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]] ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]] +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]] ; VI: flat_store_dword ; GFX9: s_load_dword [[VAL:s[0-9]+]] @@ -62,8 +62,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] ; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| -; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] +; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]] ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] @@ -134,7 +134,9 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %fmul = fmul <2 x half> %fabs, %val store <2 x half> %fmul, <2 x half> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index 9edf55cbc69f..0c4a77964d15 100644 --- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -16,8 +16,8 @@ ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]] +; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]] @@ -49,7 +49,7 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { ; GCN: buffer_load_dword [[V:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] @@ -75,13 +75,13 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] -; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]] +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]] -; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]] -; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]] ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_mul_f32_e32 @@ -108,13 +108,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] -; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]] +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]] -; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]] -; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]] ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_mul_f32_e32 @@ -191,17 +191,17 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] -; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]] -; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] +; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]] +; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]] ; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]] -; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] +; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]] -; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 -; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]] +; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]] ; GCN: buffer_store_dword [[MUL]] ; GCN: buffer_store_dword [[MAD]] @@ -226,21 +226,21 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] -; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] +; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] -; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 -; GCN-SLOWFMA: v_subrev_f32_e32 +; GCN-SLOWFMA: v_sub_f32_e32 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll index 08199be144f4..88b3be0e0d31 100644 --- a/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/test/CodeGen/AMDGPU/fadd.f16.ll @@ -2,13 +2,13 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fadd_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fadd_f16( @@ -24,7 +24,7 @@ entry: } ; GCN-LABEL: {{^}}fadd_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -42,7 +42,7 @@ entry: } ; GCN-LABEL: {{^}}fadd_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -60,8 +60,8 @@ entry: } ; GCN-LABEL: {{^}}fadd_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -70,16 +70,16 @@ entry: ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -88,15 +88,18 @@ define amdgpu_kernel void @fadd_v2f16( <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> %a.val, %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -105,12 +108,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -118,14 +121,16 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -134,12 +139,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -147,8 +152,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0> store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index c936d98673ba..8fd1f52006fb 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -5,8 +5,11 @@ ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid + %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid + %r0 = load double, double addrspace(1)* %gep1 + %r1 = load double, double addrspace(1)* %gep2 %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -42,3 +45,8 @@ define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x do store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll new file mode 100644 index 000000000000..5383bbe71ae3 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -0,0 +1,487 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s + +; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32: +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fmul float %load, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32: +; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fsub float 15.0, %load + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32: +; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fadd float %load, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32: +; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.sqrt.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fceil_value_f32: +; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.ceil.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_floor_value_f32: +; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.floor.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fma_value_f32: +; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32: +; GCN: flat_load_dword [[LOAD:v[0-9]+]], +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.canonicalize.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: +; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fpext float %load to double + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: +; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = fpext half %load to float + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64: +; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %load = load double, double addrspace(1)* %gep, align 8 + %v = fptrunc double %load to float + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32: +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fptrunc float %load to half + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32: +; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}} +; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]] +; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}} +; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]] +; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %v = fptrunc <2 x float> %load to <2 x half> + %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) + %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: +; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}} +define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fsub float -0.0, %load + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fneg_value_f32: +; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = fsub float -0.0, %v0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32: +; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.fabs.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fabs_value_f32: +; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.fabs.f32(float %v0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sin_value_f32: +; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.sin.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_cos_value_f32: +; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.cos.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sin_value_f16: +; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = tail call half @llvm.sin.f16(half %load) + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + store half %canonicalized, half addrspace(1)* %gep, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_cos_value_f16: +; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = tail call half @llvm.cos.f16(half %load) + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + store half %canonicalized, half addrspace(1)* %gep, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32: +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000 +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: +; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.minnum.f32(float %v0, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: +; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float)) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: +; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: +; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.maxnum.f32(float %load, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32: +; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64: +; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0 +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %load = load double, double addrspace(1)* %gep, align 8 + %v0 = fadd double %load, 0.0 + %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0) + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + store double %canonicalized, double addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee: +; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { +entry: + %v = fmul float %arg, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee: +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NEXT: ; return +; GCN-NOT: 1.0 +define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { +entry: + %v = fmul nnan float %arg, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +declare float @llvm.canonicalize.f32(float) #0 +declare double @llvm.canonicalize.f64(double) #0 +declare half @llvm.canonicalize.f16(half) #0 +declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare float @llvm.sqrt.f32(float) #0 +declare float @llvm.ceil.f32(float) #0 +declare float @llvm.floor.f32(float) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare float @llvm.fmuladd.f32(float, float, float) #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.sin.f32(float) #0 +declare float @llvm.cos.f32(float) #0 +declare half @llvm.sin.f16(half) #0 +declare half @llvm.cos.f16(half) #0 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare double @llvm.maxnum.f64(double, double) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 404358f0ecb9..dd8e277c1c75 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -5,6 +5,8 @@ declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} @@ -213,7 +215,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -233,7 +237,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1) ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -251,7 +257,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) @@ -270,7 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { - %val = load <2 x half>, <2 x half> addrspace(1)* %out + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll index 8c385f40b1c5..feb4c7bd4a18 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll index 7916226462f7..aef898b1a8ee 100644 --- a/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fcmp_f16_lt ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -351,23 +351,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_lt: +; SI: v_cmp_lt_f32_e32 vcc, +; SI: v_cmp_lt_f32_e32 vcc, + +; VI: v_cmp_lt_f16_e32 vcc, +; VI: v_cmp_lt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -382,22 +371,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_eq -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_eq_f32_e32 vcc, +; SI: v_cmp_eq_f32_e32 vcc, + +; VI: v_cmp_eq_f16_e32 vcc, +; VI: v_cmp_eq_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -411,23 +389,11 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_le -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_le: +; SI: v_cmp_le_f32_e32 vcc +; SI: v_cmp_le_f32_e32 vcc +; VI: v_cmp_le_f16_e32 vcc +; VI: v_cmp_le_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -441,23 +407,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_gt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_gt: +; SI: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e32 vcc, + +; VI: v_cmp_gt_f16_e32 vcc, +; VI: v_cmp_gt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -471,23 +426,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lg -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_lg: +; SI: v_cmp_lg_f32_e32 vcc, +; SI: v_cmp_lg_f32_e32 vcc, + +; VI: v_cmp_lg_f16_e32 vcc, +; VI: v_cmp_lg_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -501,23 +445,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_ge -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_ge: +; SI: v_cmp_ge_f32_e32 vcc, +; SI: v_cmp_ge_f32_e32 vcc, + +; VI: v_cmp_ge_f16_e32 vcc, +; VI: v_cmp_ge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -531,23 +464,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_o -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_o: +; SI: v_cmp_o_f32_e32 vcc, +; SI: v_cmp_o_f32_e32 vcc, + +; VI: v_cmp_o_f16_e32 vcc, +; VI: v_cmp_o_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -561,23 +483,12 @@ entry: ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_u -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; GCN-LABEL: {{^}}fcmp_v2f16_u: +; SI: v_cmp_u_f32_e32 vcc, +; SI: v_cmp_u_f32_e32 vcc, + +; VI: v_cmp_u_f16_e32 vcc, +; VI: v_cmp_u_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -592,22 +503,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nge -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nge_f32_e32 vcc, +; SI: v_cmp_nge_f32_e32 vcc, + +; VI: v_cmp_nge_f16_e32 vcc, +; VI: v_cmp_nge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -622,22 +522,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nlg -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nlg_f32_e32 vcc +; SI: v_cmp_nlg_f32_e32 vcc + +; VI: v_cmp_nlg_f16_e32 vcc +; VI: v_cmp_nlg_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -652,22 +541,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_ngt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_ngt_f32_e32 vcc, +; SI: v_cmp_ngt_f32_e32 vcc, + +; VI: v_cmp_ngt_f16_e32 vcc, +; VI: v_cmp_ngt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -682,22 +560,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_nle -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -712,22 +579,11 @@ entry: } ; GCN-LABEL: {{^}}fcmp_v2f16_neq -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} -; GCN: s_endpgm +; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, @@ -744,17 +600,19 @@ entry: ; GCN-LABEL: {{^}}fcmp_v2f16_nlt ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] -; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] -; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] +; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] + +; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] +; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] +; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]] +; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] + +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll index b9e1921d4c45..95f7e0be7d9c 100644 --- a/test/CodeGen/AMDGPU/fcmp64.ll +++ b/test/CodeGen/AMDGPU/fcmp64.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s ; CHECK-LABEL: {{^}}flt_f64: -; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -14,7 +14,7 @@ define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* } ; CHECK-LABEL: {{^}}fle_f64: -; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -26,7 +26,7 @@ define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* } ; CHECK-LABEL: {{^}}fgt_f64: -; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -38,7 +38,7 @@ define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* } ; CHECK-LABEL: {{^}}fge_f64: -; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -50,7 +50,7 @@ define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* } ; CHECK-LABEL: {{^}}fne_f64: -; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1 } ; CHECK-LABEL: {{^}}feq_f64: -; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll index 125597796245..ca313d80894a 100644 --- a/test/CodeGen/AMDGPU/fconst64.ll +++ b/test/CodeGen/AMDGPU/fconst64.ll @@ -6,8 +6,15 @@ ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r1 = load double, double addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid + %r1 = load double, double addrspace(1)* %gep %r2 = fadd double %r1, 5.000000e+00 store double %r2, double addrspace(1)* %out ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll index 4e2bf765cd95..8e984246cc94 100644 --- a/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s declare half @llvm.copysign.f16(half, half) declare float @llvm.copysign.f32(float, float) @@ -9,16 +9,18 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) +declare i32 @llvm.amdgcn.workitem.id.x() + ; GCN-LABEL: {{^}}test_copysign_f16: -; SI: buffer_load_ushort v[[SIGN:[0-9]+]] -; SI: buffer_load_ushort v[[MAG:[0-9]+]] +; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] +; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]] ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]] -; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]] -; GFX89: buffer_load_ushort v[[MAG:[0-9]+]] +; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] +; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] @@ -36,8 +38,8 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]] @@ -48,17 +50,20 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32( half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep %mag.ext = fpext half %mag to float - %sign = load float, float addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid + %sign = load float, float addrspace(1)* %arg_sign_gep %out = call float @llvm.copysign.f32(float %mag.ext, float %sign) store float %out, float addrspace(1)* %arg_out ret void } ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]] @@ -70,17 +75,20 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64( half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep %mag.ext = fpext half %mag to double - %sign = load double, double addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid + %sign = load double, double addrspace(1)* %arg_sign_gep %out = call double @llvm.copysign.f64(double %mag.ext, double %sign) store double %out, double addrspace(1)* %arg_out ret void } ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16: -; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]] @@ -93,8 +101,11 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16( float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load float, float addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid + %mag = load float, float addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %sign.ext = fpext half %sign to float %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) store float %out, float addrspace(1)* %arg_out @@ -102,8 +113,8 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]] @@ -116,8 +127,11 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16( double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load double, double addrspace(1)* %arg_mag - %sign = load half, half addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid + %mag = load double, double addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %sign.ext = fpext half %sign to double %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) store double %out, double addrspace(1)* %arg_out @@ -125,8 +139,8 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]] @@ -141,8 +155,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32( half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { entry: - %mag = load half, half addrspace(1)* %arg_mag - %sign = load float, float addrspace(1)* %arg_sign + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid + %mag = load half, half addrspace(1)* %arg_mag_gep + %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid + %sign = load float, float addrspace(1)* %arg_sign_gep %sign.trunc = fptrunc float %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) store half %out, half addrspace(1)* %arg_out @@ -150,8 +167,8 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: -; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]] @@ -166,8 +183,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64( half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid %mag = load half, half addrspace(1)* %arg_mag - %sign = load double, double addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid + %sign = load double, double addrspace(1)* %arg_sign_gep %sign.trunc = fptrunc double %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) store half %out, half addrspace(1)* %arg_out @@ -175,8 +195,8 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16: -; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] -; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]] ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] @@ -193,9 +213,12 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16( float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { entry: - %mag = load float, float addrspace(1)* %arg_mag + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid + %mag = load float, float addrspace(1)* %arg_mag_gep %mag.trunc = fptrunc float %mag to half - %sign = load half, half addrspace(1)* %arg_sign + %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid + %sign = load half, half addrspace(1)* %arg_sign_gep %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign) store half %out, half addrspace(1)* %arg_out ret void diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll index 7f84e973c958..333143393cb4 100644 --- a/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -27,7 +27,7 @@ ; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]] ; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]] -; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]] +; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -165,7 +165,7 @@ entry: ; VI: flat_load_ushort [[RHS:v[0-9]+]] ; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] +; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -187,7 +187,7 @@ entry: ; VI: flat_load_ushort [[RHS:v[0-9]+]] ; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] +; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index 738a5adba14f..bc489454341a 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -20,7 +20,7 @@ ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] -; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] @@ -45,7 +45,7 @@ entry: ; GCN-NOT: s_setreg ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] -; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] @@ -85,20 +85,11 @@ entry: } ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: -; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] -; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] -; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] - -; GCN-NOT: s_setreg -; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 -; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] -; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]] -; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] -; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] -; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; GCN-NOT: [[RESULT]] ; GCN-NOT: s_setreg -; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] -; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], +; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b @@ -121,6 +112,21 @@ entry: ret void } +; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS + +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv fast float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}fdiv_f32_arcp_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS @@ -154,8 +160,9 @@ entry: } ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: -; GCN: v_cmp_gt_f32 -; GCN: v_cmp_gt_f32 +; GCN: v_rcp_f32 +; GCN: v_rcp_f32 +; GCN-NOT: v_cmp_gt_f32 define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll index 4113ba8dc1f0..7526d08bdbe5 100644 --- a/test/CodeGen/AMDGPU/fma-combine.ll +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math @@ -387,7 +387,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace ; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, @@ -403,7 +403,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, @@ -419,7 +419,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, @@ -435,7 +435,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, @@ -451,7 +451,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y: ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, @@ -467,7 +467,7 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x: ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, @@ -483,7 +483,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y: ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, @@ -499,7 +499,7 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x: ; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, @@ -515,7 +515,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, @@ -531,7 +531,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, @@ -563,7 +563,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone: ; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, @@ -583,8 +583,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, ; FUNC-LABEL: {{^}}test_f32_interp: ; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]] -; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]] +; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]] +; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]] ; ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]] diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll index 4d3f3712621e..907121f1cd46 100644 --- a/test/CodeGen/AMDGPU/fma.f64.ll +++ b/test/CodeGen/AMDGPU/fma.f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll index 659cecb59ebf..6be4c450a51e 100644 --- a/test/CodeGen/AMDGPU/fma.ll +++ b/test/CodeGen/AMDGPU/fma.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare float @llvm.fma.f32(float, float, float) nounwind readnone declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll index 7643c3ea533c..44c80b63bf7c 100644 --- a/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -51,7 +51,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -71,7 +71,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll index 27d9261b1fab..4cfc9fc80fb0 100644 --- a/test/CodeGen/AMDGPU/fmed3.ll +++ b/test/CodeGen/AMDGPU/fmed3.ll @@ -872,8 +872,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(fl ; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]] -; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]] +; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]] define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll index 52336f95a909..0494295fc15f 100644 --- a/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -83,7 +83,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -121,7 +121,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll index cd86409e2038..5f120f63d7fe 100644 --- a/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/test/CodeGen/AMDGPU/fmul.f16.ll @@ -1,14 +1,14 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fmul_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_f16( @@ -70,16 +70,16 @@ entry: ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -108,7 +108,7 @@ entry: ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_a( @@ -134,7 +134,7 @@ entry: ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_b( diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll index f14233f267b2..d37d432842f3 100644 --- a/test/CodeGen/AMDGPU/fmul64.ll +++ b/test/CodeGen/AMDGPU/fmul64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s ; FUNC-LABEL: {{^}}fmul_f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9b713419e747..980d68ceded8 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -79,7 +79,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, @@ -108,7 +108,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, @@ -227,8 +227,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { @@ -257,8 +257,8 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { @@ -287,7 +287,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -319,7 +319,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -347,13 +347,13 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] -; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]] +; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -385,7 +385,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] ; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { @@ -416,7 +416,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { @@ -444,7 +444,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add ; VI-DENORM-CONTRACT: v_fma_f16 [[R2]], [[R1]], 2.0, -[[R2]] ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll index e42255026692..4b1e41ff91e1 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,12 +1,12 @@ -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s -; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. @@ -67,7 +67,7 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -96,7 +96,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -125,10 +125,10 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -160,10 +160,10 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -192,7 +192,7 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -221,7 +221,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -252,7 +252,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -282,7 +282,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -310,11 +310,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -345,11 +345,11 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -379,10 +379,10 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; SI: buffer_store_dword [[RESULT]] @@ -414,10 +414,10 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; SI: buffer_store_dword [[RESULT]] @@ -446,17 +446,17 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]] +; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]] ; SI-FLUSH: buffer_store_dword [[REGC]] ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -489,10 +489,10 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -525,10 +525,10 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -556,10 +556,10 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll index 86e91e04b0fc..8d91a56ee421 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f64.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s -; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s ; GCN-LABEL: {{^}}fmuladd_f64: ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll index 624610096cbc..b50a26c023ca 100644 --- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s - -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 66bf9d0ffb00..002bc47fb96a 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -9,7 +9,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] @@ -31,7 +31,7 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp ; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[ADD]] @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] @@ -82,10 +82,10 @@ define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_subrev_f32_e32 +; GCN-SAFE: v_sub_f32_e32 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, -; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -106,10 +106,10 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] -; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -133,7 +133,7 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float ; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] -; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -157,11 +157,11 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl ; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}} ; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]] -; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]] ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] -; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { @@ -185,10 +185,10 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* % ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} -; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] -; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] @@ -235,7 +235,7 @@ define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrsp ; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[ADD]] @@ -280,7 +280,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out ; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -300,7 +300,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float ; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -342,7 +342,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, fl ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] -; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[NEG_A]] define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { @@ -364,7 +364,7 @@ define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* % ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[MUL]] @@ -974,7 +974,7 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]] ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]] ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]] ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float ; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[ADD]] @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr ; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out ; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1556,7 +1556,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* % ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] -; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[NEG_A]] define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspac ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]] ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] ; GCN: buffer_store_dword [[MUL]] @@ -1664,7 +1664,7 @@ define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addr ; GCN-LABEL: {{^}}v_fneg_round_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_trunc_f32_e32 -; GCN: v_subrev_f32_e32 +; GCN: v_sub_f32_e32 ; GCN: v_cndmask_b32 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] ; GCN: s_cbranch_scc1 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] -; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]] +; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]] ; GCN: buffer_store_dword [[MUL1]] ; GCN: buffer_store_dword [[MUL0]] @@ -1851,7 +1851,7 @@ define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float ; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] ; GCN: ; use [[NEG]] ; GCN: buffer_store_dword [[MUL]] @@ -1984,8 +1984,8 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]] ; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0 -; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]] -; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]] +; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]] +; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]] ; GCN: buffer_store_dword [[MUL1]] ; GCN-NEXT: buffer_store_dword [[MUL2]] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] ; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] -; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]] +; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]] ; GCN: buffer_store_dword [[FMA0]] ; GCN: buffer_store_dword [[MUL1]] define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index f4afaca2b7a7..56aea641d16e 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 ; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| -; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}} +; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]] ; GFX89-NOT: _and ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| @@ -20,7 +20,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: ; CI-DAG: v_cvt_f32_f16_e32 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}| -; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}} +; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]] ; CI: v_cvt_f16_f32_e32 ; GFX89-NOT: _and diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll index 0a7346f410c9..3f20ca73e922 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index 2d94726cbe20..49d674252746 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s ; FIXME: Should be able to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: @@ -46,7 +46,7 @@ define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { ; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]] ; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]] -; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]] +; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]] ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir index 986c6b296c96..3155b7a8664f 100644 --- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir +++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir @@ -1,26 +1,5 @@ # RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s - ---- | - define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 { - ret void - } - - define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 { - ret void - } - - define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 { - ret void - } - - define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 { - ret void - } - - attributes #0 = { nounwind } - ... ---- # GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32 # GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec # GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec @@ -62,14 +41,14 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec %28 = REG_SEQUENCE %3, 1, %27, 2 %11 = S_MOV_B32 61440 @@ -133,14 +112,14 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec %28 = REG_SEQUENCE %3, 1, %27, 2 %11 = S_MOV_B32 61440 @@ -204,14 +183,14 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec %28 = REG_SEQUENCE %3, 1, %27, 2 %11 = S_MOV_B32 61440 @@ -275,14 +254,14 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 + %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec %28 = REG_SEQUENCE %3, 1, %27, 2 %11 = S_MOV_B32 61440 diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir index afde89d6b64b..51bb357fcf6e 100644 --- a/test/CodeGen/AMDGPU/fold-operands-order.mir +++ b/test/CodeGen/AMDGPU/fold-operands-order.mir @@ -1,10 +1,4 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s - ---- | - define amdgpu_kernel void @mov_in_use_list_2x() { - unreachable - } - ... --- diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 2c6b1cb18f7e..579a1454dd9a 100644 --- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll index 15cc73b9ee53..ec19fd199b4e 100644 --- a/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -154,7 +154,7 @@ entry: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] -; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]] +; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]] ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] ; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll index f310618d8bdb..f593030764a9 100644 --- a/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fptosi_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -60,7 +60,7 @@ entry: ; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] ; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]] ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]] ; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll index 7641c08e33c3..cebe3304d542 100644 --- a/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fptoui_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -60,7 +60,7 @@ entry: ; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] ; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] -; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] +; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll index bc72f4424c98..64df625d4bb5 100644 --- a/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}fptrunc_f32_to_f16: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] @@ -38,10 +38,10 @@ entry: ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] @@ -68,7 +68,7 @@ entry: ; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll index 9a56cbe983cd..1314dfe3c7ca 100644 --- a/test/CodeGen/AMDGPU/fract.f64.ll +++ b/test/CodeGen/AMDGPU/fract.f64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll index 207fe280c9a6..2217f67da7d3 100644 --- a/test/CodeGen/AMDGPU/fract.ll +++ b/test/CodeGen/AMDGPU/fract.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.floor.f32(float) #0 ; GCN-LABEL: {{^}}fract_f32: ; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[INPUT]], [[FLR]] ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll index 9778069d0477..3b8f58cc18a7 100644 --- a/test/CodeGen/AMDGPU/frem.ll +++ b/test/CodeGen/AMDGPU/frem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}frem_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} @@ -29,7 +29,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 ; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} ; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] -; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] +; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[X]], [[INVY]] ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] ; GCN: buffer_store_dword [[RESULT]] diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll index 453d8fb37f2f..186757e4c5d8 100644 --- a/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll index a0fd3411ca05..6bd9a0db14f6 100644 --- a/test/CodeGen/AMDGPU/fsqrt.ll +++ b/test/CodeGen/AMDGPU/fsqrt.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll index fa00c06546db..15a4ce2d88f7 100644 --- a/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/test/CodeGen/AMDGPU/fsub.f16.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}fsub_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fsub_f16( @@ -70,16 +70,16 @@ entry: ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] @@ -109,12 +109,12 @@ entry: ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0] @@ -143,12 +143,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}} diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll index e7a92d95d485..48647a2cdb89 100644 --- a/test/CodeGen/AMDGPU/fsub.ll +++ b/test/CodeGen/AMDGPU/fsub.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_fsub_f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 @@ -41,10 +41,10 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x flo } ; FUNC-LABEL: {{^}}v_fneg_fsub_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrs } ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -93,7 +93,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float a } ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %ou ; make sure it is disabled and the fneg is not folded if it is not ; "true". ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll index dc332414a152..73f1a69eeb9d 100644 --- a/test/CodeGen/AMDGPU/fsub64.ll +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.fabs.f64(double) #0 diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index 1f72ec65588e..bb2a6ba8e348 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s declare double @llvm.trunc.f64(double) nounwind readnone declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll index 19e592f50bea..4e50f995d27e 100644 --- a/test/CodeGen/AMDGPU/global-extload-i16.ll +++ b/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented diff --git a/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/test/CodeGen/AMDGPU/global-smrd-unknown.ll new file mode 100644 index 000000000000..8a576e6480a1 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-smrd-unknown.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; GCN-LABEL: {{^}}unknown_memdep_analysis: +; GCN: flat_load_dword +; GCN: flat_load_dword +; GCN: flat_store_dword +define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp53 = load float, float addrspace(1)* undef, align 4 + %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31 + %tmp55 = load float, float addrspace(1)* %tmp54, align 4 + %tmp56 = tail call float @llvm.fmuladd.f32(float undef, float %tmp53, float %tmp55) + store float %tmp56, float addrspace(1)* undef, align 4 + ret void +} + +declare float @llvm.fmuladd.f32(float, float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index 41ae5a4a0b00..43745d4b3da3 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float for SI and lower. @@ -17,7 +17,7 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] +; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { @@ -471,10 +471,10 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, ; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] ; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] ; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]] +; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]] ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll index c2668a077b09..8cda01a10f76 100644 --- a/test/CodeGen/AMDGPU/imm.ll +++ b/test/CodeGen/AMDGPU/imm.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; Use a 64-bit value with lo bits that can be represented as an inline constant ; GCN-LABEL: {{^}}i64_imm_inline_lo: diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll index cd3502baee7b..fe86a5872968 100644 --- a/test/CodeGen/AMDGPU/immv216.ll +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: @@ -305,7 +305,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 0d20c32a4770..62200b988bea 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll index c0f5218efc16..75826d530cb0 100644 --- a/test/CodeGen/AMDGPU/inline-asm.ll +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -222,9 +222,9 @@ entry: ; FIXME: Should be scheduled to shrink vcc ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2: ; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1 +; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc define amdgpu_kernel void @i1_input_phys_vgpr_x2() { entry: %val0 = load volatile i1, i1 addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index 5cd965d2fa9c..eea26192ed32 100644 --- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GatherAllAliases gives up on trying to analyze cases where the ; pointer may have been loaded from an aliased store, so make sure diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index f08d4b6c7915..06dc2cc8b90e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fabs.f16(half %a) declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 1fcdac537fba..f71b9752e9a1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 2cc63ae74bf1..1b3e09a81e5a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s ; FIXME: Enable for VI. diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index fe211d356070..7068f4559055 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll index 593c95856811..871b8c4f99b9 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll index 495e36b09f8f..39370e41e8aa 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.sffbh.i32(i32) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll index e0cec2134e70..8468aa3a7b3e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 92e3a1099da0..68fd08f778c4 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: ; GCN: v_bfe_u32 diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 0604a49372a2..071f2a6de4cd 100644 --- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.ceil.f16(half %a) declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll index d836ea36ef63..8931de63e74b 100644 --- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.cos.f16(half %a) declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) @@ -29,8 +29,8 @@ entry: ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -48,8 +48,8 @@ entry: ; GCN-NOT: and ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @cos_v2f16( diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll index 5757142b9e95..4e96a7619716 100644 --- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.exp2.f16(half %a) declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 6a18141d8035..74d1e694ffbe 100644 --- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.floor.f16(half %a) declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 3f4fba7d8ead..a379b18ffb8b 100644 --- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fma.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -128,7 +128,7 @@ define amdgpu_kernel void @fma_f16_imm_c( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16( @@ -167,7 +167,7 @@ define amdgpu_kernel void @fma_v2f16( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_a( @@ -210,7 +210,7 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_b( @@ -253,7 +253,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 806723e5136c..2d4fe08d8bde 100644 --- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -13,11 +13,11 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] -; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI-FLUSH: buffer_store_short v[[C_F16]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] @@ -110,19 +110,19 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] ; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] ; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]] ; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -131,7 +131,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 -; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]] +; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll index 773eb55283e4..277195c53208 100644 --- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.log2.f16(half %a) declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 8f4b314ffabb..c72716439a76 100644 --- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -9,9 +9,9 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16( @@ -73,18 +73,18 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -115,7 +115,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( @@ -143,7 +143,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 1a86286f7136..0e93acc27dc5 100644 --- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -9,9 +9,9 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_f16( @@ -72,18 +72,18 @@ entry: ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -116,7 +116,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_a( @@ -144,7 +144,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_v2f16_imm_b( diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 30cb969a76e5..92282083984b 100644 --- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) @@ -34,12 +34,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: v_and_b32 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll index ffe87977870b..7e29147571f2 100644 --- a/test/CodeGen/AMDGPU/llvm.round.ll +++ b/test/CodeGen/AMDGPU/llvm.round.ll @@ -12,7 +12,7 @@ ; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] ; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]] ; GCN: buffer_store_dword [[RESULT]] ; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] @@ -70,7 +70,7 @@ define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x floa ; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] ; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5 ; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]] -; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]] ; GFX89: buffer_store_short [[RESULT]] define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 { %x.arg.trunc = trunc i32 %x.arg to i16 diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll index eb1f32c981f8..08b9d9d873b4 100644 --- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) @@ -29,9 +29,9 @@ entry: ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] ; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] ; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] @@ -47,10 +47,10 @@ entry: ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 46ee6526aca2..0e1358ecca22 100644 --- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.sqrt.f16(half %a) declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: v_and_b32 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index dc7182aa0d89..37ee4e92c637 100644 --- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: v_and_b32 -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: v_and_b32 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll index bd6fea587b42..77557a584093 100644 --- a/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/test/CodeGen/AMDGPU/load-global-f32.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_f32: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll index 5b772e1fe5ee..84214b7dbc10 100644 --- a/test/CodeGen/AMDGPU/load-global-f64.ll +++ b/test/CodeGen/AMDGPU/load-global-f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_f64: ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index e3415b9c47de..cb2495d5fdcf 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll index 5df32c1e3120..6360d39666c7 100644 --- a/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/test/CodeGen/AMDGPU/load-global-i32.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll index de16b6c8997e..c71db0b7357c 100644 --- a/test/CodeGen/AMDGPU/load-global-i64.ll +++ b/test/CodeGen/AMDGPU/load-global-i64.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i64: ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll index fc0cbf916b52..3fe6bd26be14 100644 --- a/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i8: diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll index d6162c388b5b..f9ba6241fe06 100644 --- a/test/CodeGen/AMDGPU/load-weird-sizes.ll +++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i24: ; SI: {{flat|buffer}}_load_ubyte diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index 74564f387ede..e1a2af6c7ef9 100644 --- a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1 @@ -21,6 +22,17 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* ; OPT-NEXT: load i8 ; OPT: getelementptr ; OPT-NEXT: store i8 + +; WOPT-LABEL: @min_size_large_static_memcpy_caller0( +; WOPT-NOT: call +; WOPT: br label %load-store-loop +; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index +; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]] +; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index +; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]] +; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1 +; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025 +; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) ret void diff --git a/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir new file mode 100644 index 000000000000..768acf35eeae --- /dev/null +++ b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir @@ -0,0 +1,227 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: cluster_add_addc +# GCN: S_NOP 0, implicit-def %vcc +# GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec +name: cluster_add_addc +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec +... + +# GCN-LABEL: name: interleave_add64s +# GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec +# GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec +# GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec +name: interleave_add64s +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: vgpr_32 } + - { id: 9, class: sreg_64 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: sreg_64 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: sreg_64 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: sreg_64 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + %4 = V_MOV_B32_e32 0, implicit %exec + %5 = V_MOV_B32_e32 0, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + + %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec + %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec + + + %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec + %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec +... + +# GCN-LABEL: name: cluster_mov_addc +# GCN: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %2 = S_MOV_B64 0 +# GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec +name: cluster_mov_addc +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = S_MOV_B64 0 + S_NOP 0, implicit def %vcc + %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec +... + +# GCN-LABEL: name: no_cluster_add_addc_diff_sgpr +# GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %8 = S_MOV_B64 0 +# GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec +name: no_cluster_add_addc_diff_sgpr +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + - { id: 8, class: sreg_64 } +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %8 = S_MOV_B64 0 + %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec +... +# GCN-LABEL: name: cluster_sub_subb +# GCN: S_NOP 0, implicit-def %vcc +# GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec +# GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec +name: cluster_sub_subb +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec + %6 = V_MOV_B32_e32 0, implicit %exec + %7 = V_MOV_B32_e32 0, implicit %exec + S_NOP 0, implicit def %vcc + %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec +... + +# GCN-LABEL: name: cluster_cmp_cndmask +# GCN: S_NOP 0, implicit-def %vcc +# GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec +name: cluster_cmp_cndmask +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + S_NOP 0, implicit def %vcc + %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec +... + +# GCN-LABEL: name: cluster_multi_use_cmp_cndmask +# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec +# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +name: cluster_multi_use_cmp_cndmask +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + + %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + S_NOP 0, implicit def %vcc + %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec + %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +... + +# GCN-LABEL: name: cluster_multi_use_cmp_cndmask2 +# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec +# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec +# GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +name: cluster_multi_use_cmp_cndmask2 +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: vgpr_32 } + +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %1 = V_MOV_B32_e32 0, implicit %exec + %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec + %2 = V_MOV_B32_e32 0, implicit %exec + %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec + %3 = V_MOV_B32_e32 0, implicit %exec + %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec +... diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll index b855fc500c6b..8a6bf853a7c6 100644 --- a/test/CodeGen/AMDGPU/mad-combine.ll +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -19,15 +19,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] +; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-SLOWFMAF-NOT: v_fma ; SI-DENORM-SLOWFMAF-NOT: v_mad -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] @@ -55,15 +55,15 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] -; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]] +; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]] +; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]] ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] ; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} @@ -99,11 +99,11 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] +; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] @@ -133,8 +133,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { @@ -167,9 +167,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} @@ -205,8 +205,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { @@ -238,9 +238,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} @@ -278,7 +278,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { @@ -313,8 +313,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} @@ -355,9 +355,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1 ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] ; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]] ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} @@ -395,13 +395,13 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] +; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]] -; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] +; SI-DENORM: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP1]], [[C]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { @@ -437,13 +437,13 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace( ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] +; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]] -; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] +; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm @@ -479,21 +479,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace( ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]] -; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]] +; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] +; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]] +; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]] ; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]] +; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]] -; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] +; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm @@ -530,21 +530,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace( ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]] -; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]] +; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] +; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]] +; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]] ; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] ; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] -; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] ; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] +; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]] -; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index 8e0014911def..77c35fac8b5d 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] -; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] +; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] ; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -199,7 +199,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: buffer_load_dword [[VGPR:v[0-9]+]] ; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { bb: diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll index 6bc40e82459b..b78d65ae1e1a 100644 --- a/test/CodeGen/AMDGPU/madmk.ll +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -32,8 +32,8 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] -; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]] +; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]] ; GCN: s_endpgm define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll index ffcdac03bc74..6387c9ff6dfa 100644 --- a/test/CodeGen/AMDGPU/max.ll +++ b/test/CodeGen/AMDGPU/max.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imax_sge_i32: diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index dfd5b97fcc86..6b0ec483247c 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s ; This test is mostly to test DAG store merging, so disable the vectorizer. ; Run with devices with different unaligned load restrictions. diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll index b23b21118aaa..97666492e376 100644 --- a/test/CodeGen/AMDGPU/mubuf.ll +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s declare i32 @llvm.amdgcn.workitem.id.x() readnone diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll index 57c50c9804e5..a0290789175d 100644 --- a/test/CodeGen/AMDGPU/mul.ll +++ b/test/CodeGen/AMDGPU/mul.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC ; mul24 and mad24 are affected diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 82c27f204a47..ba3ff0b08bc9 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -66,9 +66,9 @@ ; FIXME: Why is this compare essentially repeated? ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] -; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 +; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll index 8a7bf6db5b8d..500e4cb3cc73 100644 --- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index eb082843fb82..8e6885c4fc5e 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}or_v2i32: diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll index f83eb56dc6ed..776b151e3017 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0 declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0 diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll index ecb513cd80b6..d8c7438e4d0d 100644 --- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll +++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir new file mode 100644 index 000000000000..bac348aaed70 --- /dev/null +++ b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir @@ -0,0 +1,162 @@ +# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s +# +# See bug http://llvm.org/PR33524 for details of the problem being checked here +# This test will provoke a subrange join (see annotations below) during simple register coalescing +# Without a fix for PR33524 this causes an unreachable in SubRange Join +# +# GCN-DAG: undef %[[REG0:[0-9]+]].sub0 = COPY %sgpr5 +# GCN-DAG: undef %[[REG1:[0-9]+]].sub0 = COPY %sgpr2 +# GCN-DAG: %[[REG0]].sub1 = S_MOV_B32 1 +# GCN-DAG: %[[REG1]].sub1 = S_MOV_B32 1 + +--- | + define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 { + ret void + } + +... +--- +name: regcoal-subrange-join +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vreg_128 } + - { id: 3, class: vreg_128 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0, preferred-register: '%8' } + - { id: 7, class: vreg_128 } + - { id: 8, class: sreg_32_xm0, preferred-register: '%6' } + - { id: 9, class: vreg_128 } + - { id: 10, class: sgpr_32 } + - { id: 11, class: sgpr_32 } + - { id: 12, class: sgpr_32 } + - { id: 13, class: sgpr_32 } + - { id: 14, class: sgpr_32 } + - { id: 15, class: sgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: sreg_32_xm0 } + - { id: 18, class: sreg_64 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_32_xm0 } + - { id: 21, class: sreg_64 } + - { id: 22, class: sreg_32_xm0_xexec } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64_xexec } + - { id: 25, class: sreg_128 } + - { id: 26, class: sreg_64_xexec } + - { id: 27, class: sreg_32_xm0_xexec } + - { id: 28, class: sreg_32_xm0 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vgpr_32 } + - { id: 33, class: vgpr_32 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vgpr_32 } + - { id: 38, class: sreg_128 } + - { id: 39, class: sreg_64_xexec } + - { id: 40, class: sreg_32_xm0_xexec } + - { id: 41, class: sreg_32_xm0 } + - { id: 42, class: vgpr_32 } + - { id: 43, class: vgpr_32 } + - { id: 44, class: vgpr_32 } + - { id: 45, class: vgpr_32 } + - { id: 46, class: vgpr_32 } + - { id: 47, class: vgpr_32 } + - { id: 48, class: vgpr_32 } + - { id: 49, class: vgpr_32 } + - { id: 50, class: vgpr_32 } + - { id: 51, class: sreg_128 } + - { id: 52, class: vgpr_32 } + - { id: 53, class: vgpr_32 } + - { id: 54, class: vgpr_32 } + - { id: 55, class: vgpr_32 } + - { id: 56, class: vreg_128 } + - { id: 57, class: vreg_128 } + - { id: 58, class: vreg_128 } + - { id: 59, class: sreg_32_xm0 } + - { id: 60, class: sreg_32_xm0 } + - { id: 61, class: vreg_128 } +liveins: + - { reg: '%sgpr2', virtual-reg: '%12' } + - { reg: '%sgpr5', virtual-reg: '%15' } +body: | + bb.0: + liveins: %sgpr2, %sgpr5 + + %15 = COPY killed %sgpr5 + %12 = COPY killed %sgpr2 + %17 = S_MOV_B32 1 + undef %18.sub1 = COPY %17 + %0 = COPY %18 + %0.sub0 = COPY killed %12 + %21 = COPY killed %18 + %21.sub0 = COPY killed %15 + %22 = S_LOAD_DWORD_IMM killed %21, 2, 0 + %23 = S_MOV_B32 491436 + undef %24.sub0 = COPY killed %22 + %24.sub1 = COPY killed %23 + %25 = S_LOAD_DWORDX4_IMM killed %24, 0, 0 + %1 = COPY killed %25 + %26 = S_LOAD_DWORDX2_IMM %0, 2, 0 + dead %27 = S_LOAD_DWORD_IMM killed %26, 0, 0 + S_CBRANCH_SCC0 %bb.1, implicit undef %scc + + bb.5: + %58 = COPY killed %1 + %59 = COPY killed %17 + S_BRANCH %bb.2 + + bb.1: + %30 = V_MOV_B32_e32 1036831949, implicit %exec + %31 = V_ADD_F32_e32 %30, %1.sub3, implicit %exec + %33 = V_ADD_F32_e32 %30, %1.sub2, implicit %exec + %35 = V_ADD_F32_e32 %30, %1.sub1, implicit %exec + %37 = V_ADD_F32_e32 killed %30, killed %1.sub0, implicit %exec + undef %56.sub0 = COPY killed %37 + %56.sub1 = COPY killed %35 + %56.sub2 = COPY killed %33 + %56.sub3 = COPY killed %31 + %28 = S_MOV_B32 0 + %2 = COPY killed %56 + %58 = COPY killed %2 + %59 = COPY killed %28 + + bb.2: + %4 = COPY killed %59 + %3 = COPY killed %58 + %39 = S_LOAD_DWORDX2_IMM killed %0, 6, 0 + %40 = S_LOAD_DWORD_IMM killed %39, 0, 0 + %43 = V_MOV_B32_e32 -1102263091, implicit %exec + %60 = COPY killed %4 + %61 = COPY killed %3 + + bb.3: + successors: %bb.3, %bb.4 + + %7 = COPY killed %61 + %6 = COPY killed %60 + %8 = S_ADD_I32 killed %6, 1, implicit-def dead %scc + %44 = V_ADD_F32_e32 %43, %7.sub3, implicit %exec + %46 = V_ADD_F32_e32 %43, %7.sub2, implicit %exec + %48 = V_ADD_F32_e32 %43, %7.sub1, implicit %exec + %50 = V_ADD_F32_e32 %43, killed %7.sub0, implicit %exec + undef %57.sub0 = COPY killed %50 + %57.sub1 = COPY killed %48 + %57.sub2 = COPY %46 + %57.sub3 = COPY killed %44 + S_CMP_LT_I32 %8, %40, implicit-def %scc + %60 = COPY killed %8 + %61 = COPY killed %57 + S_CBRANCH_SCC1 %bb.3, implicit killed %scc + S_BRANCH %bb.4 + + bb.4: + EXP 32, undef %53, undef %54, killed %46, undef %55, 0, 0, 15, implicit %exec + S_ENDPGM + +... diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll index ff4069226a62..260b32ed3406 100644 --- a/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: ; SI: buffer_load_dwordx4 diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll index 266490718dd1..fa29d789cebe 100644 --- a/test/CodeGen/AMDGPU/rotl.i64.ll +++ b/test/CodeGen/AMDGPU/rotl.i64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s ; BOTH-LABEL: {{^}}s_rotl_i64: ; BOTH-DAG: s_lshl_b64 diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll index 9eda479cd25c..af58b404ca6c 100644 --- a/test/CodeGen/AMDGPU/rotr.i64.ll +++ b/test/CodeGen/AMDGPU/rotr.i64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s ; BOTH-LABEL: {{^}}s_rotr_i64: ; BOTH-DAG: s_sub_i32 diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll index 9462683efe0e..204eeb998386 100644 --- a/test/CodeGen/AMDGPU/rsq.ll +++ b/test/CodeGen/AMDGPU/rsq.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.sqrt.f32(float) nounwind readnone @@ -48,8 +48,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float ; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] ; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] -; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]] +; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI-UNSAFE: buffer_store_dword [[RESULT]] ; SI-SAFE-NOT: v_rsq_f32 diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll index a131aaa3dfb4..797fbc2712b0 100644 --- a/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll index f7a1c65881d0..ee56e9053fd3 100644 --- a/test/CodeGen/AMDGPU/sad.ll +++ b/test/CodeGen/AMDGPU/sad.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll index 586a455b2b91..09e87d524419 100644 --- a/test/CodeGen/AMDGPU/saddo.ll +++ b/test/CodeGen/AMDGPU/saddo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index 6e1dd1638333..d5b2fa0b6754 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll index 62d0d9367885..0f09fa17423e 100644 --- a/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; XXX - Why the packing? ; GCN-LABEL: {{^}}scalar_to_vector_v2i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] -; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] +; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] ; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll index 44d46086f02a..2dddba8bccc7 100644 --- a/test/CodeGen/AMDGPU/schedule-global-loads.ll +++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s ; FIXME: This currently doesn't do a great job of clustering the ; loads, which end up with extra moves between them. Right now, it diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 6b1e85915a11..4ae9871865f5 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll index abd15f1fb47f..6ed730ad60f4 100644 --- a/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/test/CodeGen/AMDGPU/scratch-simple.ll @@ -12,10 +12,8 @@ ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] -; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200 -; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400 -; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]] -; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]] +; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] +; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll index 7ec6ca809b68..305107f690fb 100644 --- a/test/CodeGen/AMDGPU/sdiv.ll +++ b/test/CodeGen/AMDGPU/sdiv.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; The code generated by sdiv is long and complex and may frequently change. ; The goal of this test is to make sure the ISel doesn't fail. diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 0dc7cc309f7c..0d181c2c34b8 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -35,7 +35,7 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-LABEL: {{^}}mul_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]] +; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]] ; NOSDWA-NOT: v_mul_u32_u24_sdwa ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -68,9 +68,9 @@ entry: ; GCN-LABEL: {{^}}mul_v2i16: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mul_u32_u24_sdwa ; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -168,14 +168,14 @@ entry: ; GCN-LABEL: {{^}}mul_v2half: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mul_f16_sdwa ; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] +; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -362,9 +362,9 @@ entry: ; GCN-LABEL: {{^}}mac_v2half: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mac_f16_sdwa ; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -491,7 +491,7 @@ entry: %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - + %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8 ret void diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll index 3417eb02b361..e0619251f920 100644 --- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -103,7 +103,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -122,7 +122,7 @@ define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -154,7 +154,7 @@ define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -171,7 +171,7 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -191,7 +191,7 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -245,7 +245,7 @@ define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Z:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -266,8 +266,8 @@ define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[W:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc -; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]] +; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]] define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -291,7 +291,7 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { ; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]] ; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc -; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]] +; GCN-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[Z]], [[SELECT]] ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[NEG_X]] @@ -316,8 +316,8 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) ; GCN: buffer_load_dword [[W:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc -; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]] +; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]] define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -341,7 +341,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -359,7 +359,7 @@ define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -377,7 +377,7 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -397,7 +397,7 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -414,7 +414,7 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -431,7 +431,7 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -445,7 +445,7 @@ define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -479,7 +479,7 @@ define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -497,7 +497,7 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -517,7 +517,7 @@ define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { ; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]] ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -540,7 +540,7 @@ define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]] ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -563,7 +563,7 @@ define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -585,7 +585,7 @@ define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc -; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] +; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -606,7 +606,7 @@ define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -628,7 +628,7 @@ define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc -; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] +; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index ebbc675b2bab..b77ebcf5bf52 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll index 92ee2eb7f403..e79ce3af0cf9 100644 --- a/test/CodeGen/AMDGPU/select.f16.ll +++ b/test/CodeGen/AMDGPU/select.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}select_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -104,8 +104,8 @@ entry: ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}} +; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -134,8 +134,8 @@ entry: ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}} +; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -159,16 +159,16 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 ; SI: v_cmp_lt_f32_e32 ; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e64 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 -; VI: v_cmp_lt_f16_e64 ; VI: v_cmp_lt_f16_e32 -; VI: v_cndmask_b32_e64 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_lt_f16_e32 ; VI: v_cndmask_b32_e32 ; GCN: s_endpgm @@ -196,13 +196,17 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cmp_gt_f32_e64 -; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5 -; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_gt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 +; SI: v_cmp_lt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_f32_e32 +; SI: v_cndmask_b32_e32 + +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f16_e32 +; VI: v_cndmask_b32_e32 + ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm @@ -228,13 +232,16 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cmp_lt_f32_e64 -; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5 -; VI: v_cmp_gt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e64 +; SI: v_cmp_gt_f32_e32 vcc, 0.5 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32_e32 + +; VI: v_cmp_gt_f16_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32_e32 ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 @@ -263,8 +270,8 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cmp_nlt_f32_e32 -; SI: v_cmp_nlt_f32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cmp_nlt_f32_e32 ; SI: v_cndmask_b32_e32 ; VI: v_cmp_nlt_f16_e32 @@ -298,13 +305,17 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 + ; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32 +; SI: v_cmp_lt_f32_e32 +; SI: v_cndmask_b32 ; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 +; VI: v_cndmask_b32 +; VI: v_cmp_lt_f16_e32 +; VI: v_cndmask_b32 + ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll index 8d455d84bf9e..bcaa1aa54c15 100644 --- a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll +++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll @@ -7,7 +7,7 @@ ; GCN: buffer_load_dword [[B:v[0-9]+]] ; GCN: buffer_load_dword [[C:v[0-9]+]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @multi_use_fneg_src() #0 { @@ -30,7 +30,7 @@ define amdgpu_kernel void @multi_use_fneg_src() #0 { ; GCN: buffer_load_dword [[B:v[0-9]+]] ; GCN: buffer_load_dword [[C:v[0-9]+]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]] ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]] define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 { @@ -78,7 +78,7 @@ define amdgpu_kernel void @multi_use_fneg() #0 { ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: buffer_load_dword [[B:v[0-9]+]] -; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]] +; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]] ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]] ; GCN: buffer_store_dword [[MUL1]] diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll index f63719d62a84..a3bf167e756a 100644 --- a/test/CodeGen/AMDGPU/setcc.ll +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -7,8 +7,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y -; GCN-DAG: v_cmp_eq_u32_e32 -; GCN-DAG: v_cmp_eq_u32_e64 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> @@ -23,9 +23,9 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; GCN: v_cmp_eq_u32_e32 -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cmp_eq_u32_e64 -; GCN: v_cmp_eq_u32_e64 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 +; GCN: v_cmp_eq_u32_e32 define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 160fb6a038fe..5b4d9ed259b6 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar cases when legal. ; FIXME: r600 fails verifier diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll index fb0bbaa9cbf2..8250bad7b0a1 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s ; Copy VGPR -> SGPR used twice as an instruction operand, which is then ; used in an REG_SEQUENCE that also needs to be handled. diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll index 931051102cd5..3b24cf82d783 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 4f7b61adc91d..2f9eed457ab6 100644 --- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Extract the high bit of the 1st quarter ; GCN-LABEL: {{^}}v_uextract_bit_31_i128: @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 ; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}} ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] +; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[ELT1PART]], v[[SHLLO]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}} ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index c70eb9b9c4a5..670287ba7937 100644 --- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half. diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll index 5306e190a4f9..f3faa39c64e6 100644 --- a/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s ; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0 diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index edc313ee323b..13ac9140b827 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir index 6248d8a46daf..767118eb8d11 100644 --- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -6,92 +6,7 @@ # that the post-RA run does manage to shrink it, but right now the # resume crashes ---- | - define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = add i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = sub i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = sub i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = add i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = add i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid.ext = sext i32 %tid to i64 - %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext - %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext - %a = load volatile i32, i32 addrspace(1)* %a.ptr - %b = load volatile i32, i32 addrspace(1)* %b.ptr - %result = add i32 %a, %b - store volatile i32 %result, i32 addrspace(1)* %out.gep - ret void - } - - declare i32 @llvm.amdgcn.workitem.id.x() #1 - - attributes #0 = { nounwind } - attributes #1 = { nounwind readnone } - ... ---- # GCN-LABEL: name: shrink_add_vop3{{$}} # GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec # GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec @@ -151,13 +66,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -166,11 +81,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -235,13 +150,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -250,11 +165,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -319,13 +234,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -334,11 +249,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... @@ -402,13 +317,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -417,18 +332,18 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %9 = S_MOV_B64 0 %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... --- # GCN-LABEL: name: shrink_addc_vop3{{$}} -# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec +# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit %vcc, implicit %exec # GCN %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec name: shrink_addc_vop3 @@ -487,13 +402,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -502,19 +417,19 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %vcc = S_MOV_B64 0 %29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... --- # GCN-LABEL: name: shrink_addc_undef_vcc{{$}} -# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec +# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit undef %vcc, implicit %exec # GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec name: shrink_addc_undef_vcc alignment: 0 @@ -572,13 +487,13 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 + %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec %27 = REG_SEQUENCE %3, 1, %26, 2 %10 = S_MOV_B32 61440 @@ -587,11 +502,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit %exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr) - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr) + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec %29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep) + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 348c7200c0bc..17109187d538 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll index 3e452c214e98..c80945f390be 100644 --- a/test/CodeGen/AMDGPU/sign_extend.ll +++ b/test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll index 574d1c0b2c78..0bcef99df39f 100644 --- a/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}sitofp_i16_to_f16 ; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll index 827d672022eb..41430715f347 100644 --- a/test/CodeGen/AMDGPU/sminmax.ll +++ b/test/CodeGen/AMDGPU/sminmax.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}s_abs_i32: ; GCN: s_abs_i32 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind ; FUNC-LABEL: {{^}}v_abs_i32: ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]] +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]] ; GCN: v_add_i32 ; EG: MAX_INT @@ -34,7 +34,7 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* % ; GCN-LABEL: {{^}}v_abs_i32_repeat_user: ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] -; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]] +; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]] ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]] define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 @@ -71,8 +71,8 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] ; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]] ; GCN: v_add_i32 ; GCN: v_add_i32 @@ -132,10 +132,10 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> % ; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] ; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]] -; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]] ; GCN: v_add_i32 ; GCN: v_add_i32 @@ -184,8 +184,8 @@ define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace( ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] -; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] +; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]] +; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]] define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll index a9aac2d8abb7..27263429650d 100644 --- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s ; GCN-LABEL: {{^}}s_abs_v2i16: ; GFX9: s_load_dword [[VAL:s[0-9]+]] diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll index 1ca0919258a8..cbf9f37e29ef 100644 --- a/test/CodeGen/AMDGPU/spill-cfg-position.ll +++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s ; Inline spiller can decide to move a spill as early as possible in the basic block. ; It will skip phis and label, but we also need to make sure it skips instructions diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index 44cfdf6398ae..74618b263bad 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll index e06725892089..51eaf9a960b0 100644 --- a/test/CodeGen/AMDGPU/srem.ll +++ b/test/CodeGen/AMDGPU/srem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index cb40ecf2de1c..8878b4538555 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll index 135632343f90..d65c2adc7e20 100644 --- a/test/CodeGen/AMDGPU/ssubo.ll +++ b/test/CodeGen/AMDGPU/ssubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll index 1d407ea9bcda..14bedceed6ee 100644 --- a/test/CodeGen/AMDGPU/sub.i16.ll +++ b/test/CodeGen/AMDGPU/sub.i16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -68,7 +68,7 @@ define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] +; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index e7655df15520..46f1b120f212 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index ee923e2b8b61..8d5c8b64efb8 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -6,7 +6,7 @@ ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} @@ -201,8 +201,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI: flat_load_ushort v[[B_LO:[0-9]+]] ; VI: flat_load_ushort v[[B_HI:[0-9]+]] -; VI-DAG: v_subrev_u16_e32 -; VI-DAG: v_subrev_u16_e32 +; VI: v_sub_u16_e32 +; VI: v_sub_u16_e32 ; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -228,8 +228,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: v_subrev_u16_e32 -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 +; VI: v_sub_u16_e32 ; VI: buffer_store_dwordx2 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 diff --git a/test/CodeGen/AMDGPU/syncscopes.ll b/test/CodeGen/AMDGPU/syncscopes.ll new file mode 100644 index 000000000000..3741ce788993 --- /dev/null +++ b/test/CodeGen/AMDGPU/syncscopes.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-before=si-debugger-insert-nops < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: syncscopes +; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) +; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) +; GCN: FLAT_STORE_DWORD killed %vgpr7_vgpr8, killed %vgpr6, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) +define void @syncscopes( + i32 %agent, + i32 addrspace(4)* %agent_out, + i32 %workgroup, + i32 addrspace(4)* %workgroup_out, + i32 %wavefront, + i32 addrspace(4)* %wavefront_out) { +entry: + store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4 + store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index f90040385f75..77a6820713d6 100644 --- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll index 0c91d52df0c0..da038f4b0597 100644 --- a/test/CodeGen/AMDGPU/trunc.ll +++ b/test/CodeGen/AMDGPU/trunc.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll index 632ccaa7e612..5754bd9bb913 100644 --- a/test/CodeGen/AMDGPU/uaddo.ll +++ b/test/CodeGen/AMDGPU/uaddo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_uaddo_i64_zext: ; GCN: s_add_u32 @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_uaddo_i32_novcc: -; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG: ADDC_UINT ; EG: ADD_INT diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll index d9dab0d40acf..1d683776bfd5 100644 --- a/test/CodeGen/AMDGPU/udiv.ll +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll index 0c3b0fcaf854..eaa1d073cafb 100644 --- a/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uitofp_i16_to_f16 ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll index fb4eab43a2d6..823c918dcda7 100644 --- a/test/CodeGen/AMDGPU/urem.ll +++ b/test/CodeGen/AMDGPU/urem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; The code generated by urem is long and complex and may frequently ; change. The goal of this test is to make sure the ISel doesn't fail diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll index d1f454f0bc65..f01bf498e0d8 100644 --- a/test/CodeGen/AMDGPU/usubo.ll +++ b/test/CodeGen/AMDGPU/usubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_usubo_i64_zext: ; GCN: s_sub_u32 @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_usubo_i32_novcc: -; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_usubo_i16: -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 ; VI: v_cmp_gt_u16_e32 define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll index d4a68a418ee4..5cbfae34e1bb 100644 --- a/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/test/CodeGen/AMDGPU/v_cndmask.ll @@ -200,9 +200,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* % ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc -; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s +; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -292,10 +292,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs ; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1: ; GCN: load_dword ; GCN: load_ubyte -; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v +; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v ; DCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v -; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc +; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v +; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}} ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll index 2b96f7d50076..da57155f33ef 100644 --- a/test/CodeGen/AMDGPU/v_mac.ll +++ b/test/CodeGen/AMDGPU/v_mac.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} ; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]] ; GCN: buffer_store_dword [[C]] define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: @@ -135,7 +135,7 @@ entry: ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]] +; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll index ce4a69db3506..46c9b7ee1a3d 100644 --- a/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] @@ -8,10 +8,10 @@ ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] -; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm define amdgpu_kernel void @mac_f16( @@ -147,9 +147,9 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( half addrspace(1)* %r, @@ -171,9 +171,9 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( half addrspace(1)* %r, @@ -312,20 +312,20 @@ entry: ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -481,14 +481,14 @@ entry: ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( @@ -513,14 +513,14 @@ entry: ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] ; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll index 90cf34e609f6..381ff5b1b518 100644 --- a/test/CodeGen/AMDGPU/vectorize-global-local.ll +++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-DAG: flat_load_dwordx4 ; CHECK-DAG: flat_load_dwordx4 ; CHECK-DAG: flat_load_dwordx4 diff --git a/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir new file mode 100644 index 000000000000..f8a2339626cf --- /dev/null +++ b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir @@ -0,0 +1,161 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s +--- | + + define amdgpu_kernel void @fold_fi_vgpr() { + %alloca = alloca [4 x i32] + ret void + } + + define amdgpu_kernel void @fold_vgpr_fi() { + %alloca = alloca [4 x i32] + ret void + } + + define amdgpu_kernel void @fold_sgpr_fi() { + %alloca = alloca [4 x i32] + ret void + } + + define amdgpu_kernel void @fold_fi_sgpr() { + %alloca = alloca [4 x i32] + ret void + } + + define amdgpu_kernel void @fold_fi_imm() { + %alloca = alloca [4 x i32] + ret void + } + + define amdgpu_kernel void @fold_imm_fi() { + %alloca = alloca [4 x i32] + ret void + } + +... +# GCN-LABEL: name: fold_fi_vgpr{{$}} +# GCN: %1 = IMPLICIT_DEF + +# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec +name: fold_fi_vgpr +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = IMPLICIT_DEF + %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec + S_ENDPGM + +... +# GCN-LABEL: name: fold_vgpr_fi{{$}} +# GCN: %1 = IMPLICIT_DEF +# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec +name: fold_vgpr_fi +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = IMPLICIT_DEF + %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec + S_ENDPGM + +... +# GCN-LABEL: name: fold_sgpr_fi{{$}} +# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec +# GCN: %1 = IMPLICIT_DEF +# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec +name: fold_sgpr_fi +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: sgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = IMPLICIT_DEF + %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec + S_ENDPGM + +... +# GCN-LABEL: name: fold_fi_sgpr{{$}} +# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec +# GCN: %1 = IMPLICIT_DEF +# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec +name: fold_fi_sgpr +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: sgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = IMPLICIT_DEF + %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec + S_ENDPGM +... +# TODO: Should probably prefer folding immediate first +# GCN-LABEL: name: fold_fi_imm{{$}} +# GCN: %1 = V_MOV_B32_e32 999, implicit %exec +# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec +name: fold_fi_imm +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = V_MOV_B32_e32 999, implicit %exec + %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec + S_ENDPGM + +... +# GCN-LABEL: name: fold_imm_fi{{$}} +# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec +# GCN: %2 = V_ADD_I32_e32 999, %0, implicit-def %vcc, implicit %exec +name: fold_imm_fi +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '', + di-location: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec + %1 = V_MOV_B32_e32 999, implicit %exec + %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec + S_ENDPGM diff --git a/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir new file mode 100644 index 000000000000..b4c0c93347c2 --- /dev/null +++ b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir @@ -0,0 +1,40 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s +... +# GCN-LABEL: name: fold_imm_non_ssa{{$}} +# GCN: %0 = V_MOV_B32_e32 123, implicit %exec +# GCN: %2 = V_ADD_I32_e32 456, %0, implicit-def %vcc, implicit %exec + +name: fold_imm_non_ssa +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } +body: | + bb.0: + %0 = COPY undef %0 + %0 = V_MOV_B32_e32 123, implicit %exec + %1 = V_MOV_B32_e32 456, implicit %exec + %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec + S_ENDPGM + +... +# GCN-LABEL: name: fold_partially_defined_superreg{{$}} +# GCN: %1 = V_MOV_B32_e32 456, implicit %exec +# GCN: %2 = V_ADD_I32_e32 123, %1, implicit-def %vcc, implicit %exec +name: fold_partially_defined_superreg +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + undef %3.sub0 = V_MOV_B32_e32 123, implicit %exec, implicit-def %3 + %1 = V_MOV_B32_e32 456, implicit %exec + %2, %vcc = V_ADD_I32_e64 %3.sub0, %1, implicit %exec + S_ENDPGM + +... diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll index bb6234729f90..02ffd30be5fd 100644 --- a/test/CodeGen/AMDGPU/vselect.ll +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -7,7 +7,9 @@ ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; SI: v_cndmask_b32_e64 +; SI: v_cmp_gt_i32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_i32_e32 vcc ; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { @@ -25,8 +27,11 @@ entry: ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e32 + +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: @@ -45,12 +50,10 @@ entry: ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; FIXME: The shrinking does not happen on tonga - -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: @@ -68,6 +71,10 @@ entry: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir index 44dbd38f2d30..5612c7cac00b 100644 --- a/test/CodeGen/AMDGPU/waitcnt-permute.mir +++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -1,18 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s ---- | - define float @waitcnt-permute(i32 %x, i32 %y) { - entry: - %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) - %1 = bitcast i32 %0 to float - %2 = fadd float 1.000000e+00, %1 - ret float %2 - } - - declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) - ... ---- # CHECK-LABEL: name: waitcnt-permute{{$}} # CHECK: DS_BPERMUTE_B32 # CHECK-NEXT: S_WAITCNT 127 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index 57a082a0170c..847a1d739321 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}xor_v2i32: @@ -60,7 +60,7 @@ define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* ; FUNC-LABEL: {{^}}v_xor_i1: ; SI: buffer_load_ubyte [[B:v[0-9]+]] ; SI: buffer_load_ubyte [[A:v[0-9]+]] -; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] +; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll index a902234898cd..69c42afb9ad5 100644 --- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -6,7 +6,7 @@ ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]] +; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]] ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]] +; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]] ; GCN-NOT: v[[HI]] ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 |