diff options
Diffstat (limited to 'test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll')
-rw-r--r-- | test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll | 1400 |
1 files changed, 1400 insertions, 0 deletions
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll new file mode 100644 index 000000000000..ff840e6411c1 --- /dev/null +++ b/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll @@ -0,0 +1,1400 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s + +define <4 x float> @test_4xfloat_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2) { +; CHECK-LABEL: test_4xfloat_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1],xmm1[3,1] +; CHECK-NEXT: retq + %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5> + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[2,1],xmm1[3,1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],xmm1[3,1] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,2],xmm1[3,2] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2],xmm1[3,2] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,3],xmm1[2,1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[2,1] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2) { +; CHECK-LABEL: test_4xfloat_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; CHECK-NEXT: retq + %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7> + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[3,3],xmm1[3,3] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],xmm1[3,3] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) { +; CHECK-LABEL: test_4xfloat_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[1,2] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6> + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,0],mem[1,2] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0],mem[1,2] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,3],mem[1,3] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],mem[1,3] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],mem[2,0] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],mem[2,0] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) { +; CHECK-LABEL: test_4xfloat_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1],mem[3,2] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6> + ret <4 x float> %res +} +define <4 x float> @test_4xfloat_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1],mem[3,2] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 + ret <4 x float> %res +} + +define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) { +; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],mem[3,2] +; CHECK-NEXT: retq + %vec2 = load <4 x float>, <4 x float>* %vec2p + %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6> + %cmp = fcmp oeq <4 x float> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer + ret <4 x float> %res +} + +define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { +; CHECK-LABEL: test_8xfloat_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6] +; CHECK-NEXT: retq + %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14> + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { +; CHECK-LABEL: test_8xfloat_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6] +; CHECK-NEXT: retq + %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14> + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { +; CHECK-LABEL: test_8xfloat_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12> + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { +; CHECK-LABEL: test_8xfloat_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13> + ret <8 x float> %res +} +define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 + ret <8 x float> %res +} + +define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { +; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5] +; CHECK-NEXT: retq + %vec2 = load <8 x float>, <8 x float>* %vec2p + %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13> + %cmp = fcmp oeq <8 x float> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer + ret <8 x float> %res +} + +define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) { +; CHECK-LABEL: test_16xfloat_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14] +; CHECK-NEXT: retq + %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30> + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14] +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15] +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13] +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) { +; CHECK-LABEL: test_16xfloat_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14] +; CHECK-NEXT: retq + %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30> + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14] +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14] +; CHECK-NEXT: retq + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { +; CHECK-LABEL: test_16xfloat_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30> + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14] +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14] +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14] +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { +; CHECK-LABEL: test_16xfloat_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31> + ret <16 x float> %res +} +define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15] +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 + ret <16 x float> %res +} + +define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) { +; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15] +; CHECK-NEXT: retq + %vec2 = load <16 x float>, <16 x float>* %vec2p + %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31> + %cmp = fcmp oeq <16 x float> %mask, zeroinitializer + %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer + ret <16 x float> %res +} + +define <2 x double> @test_2xdouble_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2) { +; CHECK-LABEL: test_2xdouble_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; CHECK-NEXT: retq + %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + ret <2 x double> %res +} +define <2 x double> @test_2xdouble_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 + ret <2 x double> %res +} + +define <2 x double> @test_2xdouble_zero_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0] +; CHECK-NEXT: retq + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer + ret <2 x double> %res +} +define <2 x double> @test_2xdouble_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 + ret <2 x double> %res +} + +define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0] +; CHECK-NEXT: retq + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer + ret <2 x double> %res +} +define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { +; CHECK-LABEL: test_2xdouble_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0] +; CHECK-NEXT: retq + %vec2 = load <2 x double>, <2 x double>* %vec2p + %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + ret <2 x double> %res +} +define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0] +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <2 x double>, <2 x double>* %vec2p + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 + ret <2 x double> %res +} + +define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0] +; CHECK-NEXT: retq + %vec2 = load <2 x double>, <2 x double>* %vec2p + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer + ret <2 x double> %res +} + +define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0] +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %vec2 = load <2 x double>, <2 x double>* %vec2p + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 + ret <2 x double> %res +} + +define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) { +; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0] +; CHECK-NEXT: retq + %vec2 = load <2 x double>, <2 x double>* %vec2p + %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2> + %cmp = fcmp oeq <2 x double> %mask, zeroinitializer + %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer + ret <2 x double> %res +} + +define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { +; CHECK-LABEL: test_4xdouble_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3] +; CHECK-NEXT: retq + %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7> + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[3] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[0],ymm0[3],ymm1[2] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[0],ymm0[3],ymm1[2] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { +; CHECK-LABEL: test_4xdouble_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; CHECK-NEXT: retq + %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7> + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; CHECK-NEXT: retq + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { +; CHECK-LABEL: test_4xdouble_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6> + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[2] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[1],ymm0[2],mem[2] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[1],ymm0[2],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[3],mem[2] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[3],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { +; CHECK-LABEL: test_4xdouble_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[2],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6> + ret <4 x double> %res +} +define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[2],mem[2] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 + ret <4 x double> %res +} + +define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { +; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[2],mem[2] +; CHECK-NEXT: retq + %vec2 = load <4 x double>, <4 x double>* %vec2p + %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6> + %cmp = fcmp oeq <4 x double> %mask, zeroinitializer + %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer + ret <4 x double> %res +} + +define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) { +; CHECK-LABEL: test_8xdouble_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: retq + %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15> + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7] +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6] +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) { +; CHECK-LABEL: test_8xdouble_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7] +; CHECK-NEXT: retq + %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15> + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7] +; CHECK-NEXT: vmovapd %zmm2, %zmm0 +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7] +; CHECK-NEXT: retq + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { +; CHECK-LABEL: test_8xdouble_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15> + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { +; CHECK-LABEL: test_8xdouble_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14> + ret <8 x double> %res +} +define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6] +; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 + ret <8 x double> %res +} + +define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) { +; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6] +; CHECK-NEXT: retq + %vec2 = load <8 x double>, <8 x double>* %vec2p + %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14> + %cmp = fcmp oeq <8 x double> %mask, zeroinitializer + %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer + ret <8 x double> %res +} + |