aboutsummaryrefslogtreecommitdiff
path: root/test/CodeGen/X86/avx512-shuffle-schedule.ll
diff options
context:
space:
mode:
author: Dimitry Andric <dim@FreeBSD.org> 2018-07-28 10:51:19 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2018-07-28 10:51:19 +0000
commit: eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree: 44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /test/CodeGen/X86/avx512-shuffle-schedule.ll
parent: b8a2042aa938069e862750553db0e4d82d25822c (diff)
download: src-eb11fae6d08f479c0799db45860a98af528fa6e7.tar.gz
src-eb11fae6d08f479c0799db45860a98af528fa6e7.zip
Vendor import of llvm trunk r338150:vendor/llvm/llvm-trunk-r338150
Notes
Notes: svn path=/vendor/llvm/dist/; revision=336809 svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'test/CodeGen/X86/avx512-shuffle-schedule.ll')
-rwxr-xr-x test/CodeGen/X86/avx512-shuffle-schedule.ll 5700
1 files changed, 2162 insertions, 3538 deletions
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
index 618909c151fa..7bd2368696cf 100755
--- a/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -23,8 +23,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -32,8 +31,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -47,16 +45,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -68,8 +64,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -77,8 +72,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -92,16 +86,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -113,8 +105,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -122,8 +113,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -137,16 +127,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -173,8 +161,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -182,8 +169,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve
; SKX-LABEL: test_masked_16xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -197,16 +183,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -218,7 +202,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mem_mask0:
@@ -234,16 +218,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -257,16 +239,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -280,16 +260,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -303,16 +281,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -326,16 +302,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -349,16 +323,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -372,7 +344,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mem_mask3:
@@ -388,16 +360,14 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16>
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -411,16 +381,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -433,7 +401,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1
define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
; GENERIC-LABEL: test_32xi16_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -448,18 +416,16 @@ define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -472,17 +438,15 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve
define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
@@ -493,18 +457,16 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %
define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -517,17 +479,15 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve
define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
@@ -538,18 +498,16 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %
define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -562,17 +520,15 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve
define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
@@ -583,7 +539,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %
define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
; GENERIC-LABEL: test_32xi16_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -598,18 +554,16 @@ define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -622,17 +576,15 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve
define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
@@ -643,8 +595,8 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %
define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_mem_mask0:
@@ -659,17 +611,15 @@ define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -682,17 +632,15 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -705,17 +653,15 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -728,17 +674,15 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -751,17 +695,15 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -774,17 +716,15 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -797,8 +737,8 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1
define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_mem_mask3:
@@ -813,17 +753,15 @@ define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -836,17 +774,15 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -875,8 +811,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -884,8 +819,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -899,16 +833,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
@@ -920,8 +852,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -929,8 +860,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -944,16 +874,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
@@ -965,8 +893,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -974,8 +901,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -989,16 +915,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
@@ -1025,8 +949,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -1034,8 +957,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
; SKX-LABEL: test_masked_8xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1049,16 +971,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask
; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
@@ -1070,7 +990,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_perm_mem_mask0:
@@ -1086,16 +1006,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1109,16 +1027,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1132,16 +1048,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1155,16 +1069,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1178,16 +1090,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1201,16 +1111,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1224,7 +1132,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_perm_mem_mask3:
@@ -1240,16 +1148,14 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve
; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1263,16 +1169,14 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %
; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -1285,7 +1189,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %
define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -1300,18 +1204,16 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1324,17 +1226,15 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
@@ -1345,18 +1245,16 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1369,17 +1267,15 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
@@ -1390,18 +1286,16 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1414,17 +1308,15 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
@@ -1435,7 +1327,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
; GENERIC-LABEL: test_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -1450,18 +1342,16 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1474,17 +1364,15 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve
define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
@@ -1495,8 +1383,8 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %
define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_perm_mem_mask0:
@@ -1511,17 +1399,15 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1534,17 +1420,15 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1557,17 +1441,15 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3
define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1580,17 +1462,15 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1603,17 +1483,15 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3
define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1626,17 +1504,15 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1649,8 +1525,8 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3
define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_perm_mem_mask3:
@@ -1665,17 +1541,15 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1688,17 +1562,15 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32>
define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1724,16 +1596,14 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1746,15 +1616,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -1765,16 +1633,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1787,15 +1653,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -1806,16 +1670,14 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1828,15 +1690,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
@@ -1860,16 +1720,14 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1882,15 +1740,13 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2,
define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
@@ -1901,7 +1757,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask
define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_perm_mem_mask0:
@@ -1915,15 +1771,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1936,15 +1790,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1957,15 +1809,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1978,15 +1828,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -1999,15 +1847,13 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2020,15 +1866,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2041,7 +1885,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %
define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_perm_mem_mask3:
@@ -2055,15 +1899,13 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2076,15 +1918,13 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve
define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i64>, <4 x i64>* %vp
@@ -2097,7 +1937,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %
define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
; GENERIC-LABEL: test_8xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -2112,18 +1952,16 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2136,17 +1974,15 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2,
define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
@@ -2157,16 +1993,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2179,15 +2013,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
@@ -2198,18 +2030,16 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2222,17 +2052,15 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2,
define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
@@ -2256,16 +2084,14 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2278,15 +2104,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
@@ -2297,18 +2121,16 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2321,17 +2143,15 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2,
define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
@@ -2342,16 +2162,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2364,15 +2182,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -2383,7 +2199,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %
define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
; GENERIC-LABEL: test_8xi64_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -2398,18 +2214,16 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2422,17 +2236,15 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2,
define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
@@ -2443,16 +2255,14 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask
define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2465,15 +2275,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -2484,8 +2292,8 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %
define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_perm_mem_mask0:
@@ -2500,17 +2308,15 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2523,17 +2329,15 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2546,15 +2350,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2567,15 +2369,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2588,17 +2388,15 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2611,17 +2409,15 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2634,7 +2430,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
@@ -2648,15 +2444,13 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2669,15 +2463,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2690,17 +2482,15 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i6
define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2713,17 +2503,15 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2736,15 +2524,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2757,15 +2543,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2778,8 +2562,8 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i6
define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_perm_mem_mask6:
@@ -2794,17 +2578,15 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2817,17 +2599,15 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve
define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2840,15 +2620,13 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %
define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2861,15 +2639,13 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64>
define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i64>, <8 x i64>* %vp
@@ -2898,8 +2674,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2907,8 +2682,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2922,16 +2696,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
@@ -2943,8 +2715,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2952,8 +2723,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -2967,16 +2737,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
@@ -2988,8 +2756,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -2997,8 +2764,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3012,16 +2778,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
@@ -3048,8 +2812,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float>
; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@@ -3057,8 +2820,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float>
; SKX-LABEL: test_masked_8xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3072,16 +2834,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32>
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
@@ -3093,7 +2853,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_perm_mem_mask0:
@@ -3109,16 +2869,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3132,16 +2890,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3155,16 +2911,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3178,16 +2932,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3201,16 +2953,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3224,16 +2974,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3247,7 +2995,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mas
; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_perm_mem_mask3:
@@ -3263,16 +3011,14 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3286,16 +3032,14 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x float>, <8 x float>* %vp
@@ -3308,7 +3052,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x
define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
; GENERIC-LABEL: test_16xfloat_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -3323,18 +3067,16 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3347,17 +3089,15 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl
define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
@@ -3368,18 +3108,16 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x
define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3392,17 +3130,15 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl
define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
@@ -3413,18 +3149,16 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x
define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3437,17 +3171,15 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl
define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
@@ -3458,7 +3190,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x
define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
; GENERIC-LABEL: test_16xfloat_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -3473,18 +3205,16 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3497,17 +3227,15 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl
define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
@@ -3518,8 +3246,8 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x
define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_perm_mem_mask0:
@@ -3534,17 +3262,15 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3557,17 +3283,15 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3580,17 +3304,15 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1
define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3603,17 +3325,15 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3626,17 +3346,15 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1
define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3649,17 +3367,15 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3672,8 +3388,8 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1
define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_perm_mem_mask3:
@@ -3688,17 +3404,15 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3711,17 +3425,15 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x float>, <16 x float>* %vp
@@ -3747,16 +3459,14 @@ define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3769,15 +3479,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
@@ -3788,16 +3496,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i
define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3810,15 +3516,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
@@ -3829,16 +3533,14 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i
define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3851,15 +3553,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
@@ -3883,16 +3583,14 @@ define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -3905,15 +3603,13 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou
define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
@@ -3924,7 +3620,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i
define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_perm_mem_mask0:
@@ -3938,15 +3634,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -3959,15 +3653,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -3980,15 +3672,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4
define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4001,15 +3691,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4022,15 +3710,13 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4
define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4043,15 +3729,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4064,7 +3748,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4
define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_perm_mem_mask3:
@@ -4078,15 +3762,13 @@ define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4099,15 +3781,13 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x
define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x double>, <4 x double>* %vp
@@ -4120,7 +3800,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4
define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
; GENERIC-LABEL: test_8xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -4135,18 +3815,16 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4159,17 +3837,15 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou
define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
@@ -4180,16 +3856,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4202,15 +3876,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
@@ -4221,18 +3893,16 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8
define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4245,17 +3915,15 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou
define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
@@ -4279,16 +3947,14 @@ define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4301,15 +3967,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
@@ -4320,18 +3984,16 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8
define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4344,17 +4006,15 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou
define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
@@ -4365,16 +4025,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4387,15 +4045,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
@@ -4406,7 +4062,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8
define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
; GENERIC-LABEL: test_8xdouble_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@@ -4421,18 +4077,16 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4445,17 +4099,15 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou
define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
@@ -4466,16 +4118,14 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i
define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4488,15 +4138,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
@@ -4507,8 +4155,8 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8
define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_perm_mem_mask0:
@@ -4523,17 +4171,15 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4546,17 +4192,15 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4569,15 +4213,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4590,15 +4232,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4611,17 +4251,15 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp
define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4634,17 +4272,15 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4657,7 +4293,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8
define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
@@ -4671,15 +4307,13 @@ define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4692,15 +4326,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4713,17 +4345,15 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp
define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4736,17 +4366,15 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4759,15 +4387,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4780,15 +4406,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4801,8 +4425,8 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp
define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_perm_mem_mask6:
@@ -4817,17 +4441,15 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4840,17 +4462,15 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x
define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
; SKX: # %bb.0:
; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4863,15 +4483,13 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8
define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4884,15 +4502,13 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp,
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x double>, <8 x double>* %vp
@@ -4918,16 +4534,14 @@ define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4940,15 +4554,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -4959,16 +4571,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -4981,15 +4591,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -5000,16 +4608,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask
define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5022,15 +4628,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
@@ -5054,16 +4658,14 @@ define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5076,15 +4678,13 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2,
define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
@@ -5112,16 +4712,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5135,16 +4733,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5158,16 +4754,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5181,16 +4775,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5204,16 +4796,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5227,16 +4817,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5266,16 +4854,14 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5289,16 +4875,14 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i8>, <16 x i8>* %vp
@@ -5311,7 +4895,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %
define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
; GENERIC-LABEL: test_32xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi8_perm_mask0:
@@ -5324,16 +4908,14 @@ define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5346,15 +4928,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
@@ -5365,16 +4945,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5387,15 +4965,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
@@ -5406,16 +4982,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5428,15 +5002,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
@@ -5447,7 +5019,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask
define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
; GENERIC-LABEL: test_32xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi8_perm_mask3:
@@ -5460,16 +5032,14 @@ define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5482,15 +5052,13 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2,
define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
@@ -5502,7 +5070,7 @@ define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi8_perm_mem_mask0:
@@ -5518,16 +5086,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5541,16 +5107,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5564,16 +5128,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5587,16 +5149,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5610,16 +5170,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5633,16 +5191,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5656,7 +5212,7 @@ define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi8_perm_mem_mask3:
@@ -5672,16 +5228,14 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve
; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5695,16 +5249,14 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %
; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i8>, <32 x i8>* %vp
@@ -5717,7 +5269,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %
define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
; GENERIC-LABEL: test_64xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_64xi8_perm_mask0:
@@ -5730,16 +5282,14 @@ define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5752,15 +5302,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
@@ -5771,16 +5319,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5793,15 +5339,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
@@ -5812,16 +5356,14 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5834,15 +5376,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
@@ -5853,7 +5393,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
; GENERIC-LABEL: test_64xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_64xi8_perm_mask3:
@@ -5866,16 +5406,14 @@ define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -5888,15 +5426,13 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2,
define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
@@ -5907,8 +5443,8 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask
define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_64xi8_perm_mem_mask0:
@@ -5923,17 +5459,15 @@ define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5946,17 +5480,15 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5969,17 +5501,15 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %
define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -5992,17 +5522,15 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6015,17 +5543,15 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %
define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6038,17 +5564,15 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6061,8 +5585,8 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %
define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_64xi8_perm_mem_mask3:
@@ -6077,17 +5601,15 @@ define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6100,17 +5622,15 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve
define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
+; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <64 x i8>, <64 x i8>* %vp
@@ -6136,16 +5656,14 @@ define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6158,15 +5676,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
@@ -6177,16 +5693,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6199,15 +5713,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6218,16 +5730,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6240,15 +5750,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
@@ -6272,16 +5780,14 @@ define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6294,15 +5800,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6313,16 +5817,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %
define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6335,15 +5837,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
@@ -6354,16 +5854,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6376,15 +5874,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -6408,16 +5904,14 @@ define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6430,15 +5924,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v
define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
@@ -6449,16 +5941,14 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16>
define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6471,15 +5961,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve
define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -6504,15 +5992,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6525,15 +6011,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6546,15 +6030,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6567,15 +6049,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6588,15 +6068,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6609,15 +6087,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6644,15 +6120,13 @@ define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6665,15 +6139,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6686,15 +6158,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6707,15 +6177,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6728,15 +6196,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6749,15 +6215,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6784,15 +6248,13 @@ define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6805,15 +6267,13 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16
define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6826,15 +6286,13 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i
define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6847,15 +6305,13 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16>
define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i16>, <8 x i16>* %vp
@@ -6881,16 +6337,14 @@ define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6903,15 +6357,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
@@ -6922,16 +6374,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6944,15 +6394,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -6963,16 +6411,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -6985,15 +6431,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
@@ -7017,16 +6461,14 @@ define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7039,15 +6481,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7058,16 +6498,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7080,15 +6518,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
@@ -7099,16 +6535,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7121,15 +6555,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -7153,16 +6585,14 @@ define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7175,15 +6605,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16
define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
@@ -7194,16 +6622,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i
define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7216,15 +6642,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16>
define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
@@ -7235,7 +6659,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i1
define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
@@ -7249,15 +6673,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7270,15 +6692,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7291,15 +6711,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7312,15 +6730,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7333,15 +6749,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7354,15 +6768,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7375,7 +6787,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16
define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
@@ -7389,15 +6801,13 @@ define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7410,15 +6820,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7431,15 +6839,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7452,15 +6858,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7473,15 +6877,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7494,15 +6896,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7515,7 +6915,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16
define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
@@ -7529,15 +6929,13 @@ define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7550,15 +6948,13 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7571,15 +6967,13 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16
define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7592,15 +6986,13 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x
define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -7626,16 +7018,14 @@ define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7648,15 +7038,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
@@ -7667,16 +7055,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7689,15 +7075,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7708,16 +7092,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7730,15 +7112,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
@@ -7762,16 +7142,14 @@ define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7784,15 +7162,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -7803,16 +7179,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7825,15 +7199,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
@@ -7844,16 +7216,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7866,15 +7236,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7898,16 +7266,14 @@ define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7920,15 +7286,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16
define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
@@ -7939,16 +7303,14 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i
define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -7961,15 +7323,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16>
define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -7980,7 +7340,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i1
define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
@@ -7994,15 +7354,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8015,15 +7373,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8036,15 +7392,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8057,15 +7411,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8078,15 +7430,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8099,15 +7449,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8120,7 +7468,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32
define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
@@ -8134,15 +7482,13 @@ define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8155,15 +7501,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8176,15 +7520,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8197,15 +7539,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8218,17 +7558,15 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8241,17 +7579,15 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
; SKX: # %bb.0:
; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8264,7 +7600,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32
define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
@@ -8278,15 +7614,13 @@ define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8299,15 +7633,13 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8320,15 +7652,13 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32
define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8341,15 +7671,13 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x
define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
@@ -8375,16 +7703,14 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8397,15 +7723,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
@@ -8416,16 +7740,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8438,15 +7760,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
@@ -8457,16 +7777,14 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask
define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8479,15 +7797,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
@@ -8511,16 +7827,14 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:0.50]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8533,15 +7847,13 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2,
define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -8566,15 +7878,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8587,15 +7897,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8608,15 +7916,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8629,15 +7935,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8650,15 +7954,13 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %
define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8671,15 +7973,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8706,15 +8006,13 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8727,15 +8025,13 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve
define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <4 x i32>, <4 x i32>* %vp
@@ -8761,16 +8057,14 @@ define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8783,15 +8077,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
@@ -8802,16 +8094,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mas
define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8824,15 +8114,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
@@ -8843,16 +8131,14 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mas
define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8865,15 +8151,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
@@ -8897,16 +8181,14 @@ define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -8919,15 +8201,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2,
define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
@@ -8952,15 +8232,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -8973,15 +8251,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -8994,15 +8270,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32>
define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9015,15 +8289,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9036,15 +8308,13 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32>
define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9057,15 +8327,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9092,15 +8360,13 @@ define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9113,15 +8379,13 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v
define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <8 x i32>, <8 x i32>* %vp
@@ -9147,16 +8411,14 @@ define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9169,15 +8431,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
@@ -9188,16 +8448,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9210,15 +8468,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
@@ -9229,16 +8485,14 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9251,15 +8505,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
@@ -9283,16 +8535,14 @@ define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9305,15 +8555,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v
define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
@@ -9324,7 +8572,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32>
define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_16xi32_perm_mem_mask0:
@@ -9338,15 +8586,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9359,15 +8605,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9380,15 +8624,13 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i
define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9401,15 +8643,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9422,15 +8662,13 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i
define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9443,15 +8681,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9464,7 +8700,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i
define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_16xi32_perm_mem_mask3:
@@ -9478,15 +8714,13 @@ define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9499,15 +8733,13 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32
define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -9533,16 +8765,14 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9555,15 +8785,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9574,16 +8802,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9596,15 +8822,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9615,16 +8839,14 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9637,15 +8859,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9669,16 +8889,14 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9691,15 +8909,13 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9724,16 +8940,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9747,15 +8961,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9768,16 +8980,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9791,15 +9001,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9812,16 +9020,14 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9835,15 +9041,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9870,16 +9074,14 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9893,15 +9095,13 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -9914,12 +9114,12 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1,
define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
ret <16 x float> %res
@@ -9927,16 +9127,14 @@ define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float>
define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9949,15 +9147,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -9968,16 +9164,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -9990,15 +9184,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
@@ -10009,16 +9201,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10031,15 +9221,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -10050,12 +9238,12 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
; GENERIC-LABEL: test_16xfloat_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
ret <16 x float> %res
@@ -10063,16 +9251,14 @@ define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float>
define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10085,15 +9271,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x
define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -10104,12 +9288,12 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
%res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -10118,16 +9302,14 @@ define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x flo
define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10141,15 +9323,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10162,16 +9342,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec
define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10185,15 +9363,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10206,16 +9382,14 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec
define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10229,15 +9403,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10250,12 +9422,12 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec
define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
%res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
@@ -10264,16 +9436,14 @@ define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x flo
define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10287,15 +9457,13 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1
define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -10321,16 +9489,14 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10343,15 +9509,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10362,16 +9526,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10384,15 +9546,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10403,16 +9563,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10425,15 +9583,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10457,16 +9613,14 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10479,15 +9633,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10512,16 +9664,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10535,15 +9685,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10556,16 +9704,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10579,15 +9725,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10600,16 +9744,14 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10623,15 +9765,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10658,16 +9798,14 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10681,15 +9819,13 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -10715,16 +9851,14 @@ define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double>
define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10737,15 +9871,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
@@ -10756,16 +9888,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10778,15 +9908,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -10797,16 +9925,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10819,15 +9945,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
@@ -10851,16 +9975,14 @@ define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double>
define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10873,15 +9995,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d
define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
@@ -10892,7 +10012,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
@@ -10906,16 +10026,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x doub
define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10929,15 +10047,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10950,16 +10066,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec
define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -10973,15 +10087,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -10994,16 +10106,14 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec
define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11017,15 +10127,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11038,7 +10146,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec
define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
@@ -11052,16 +10160,14 @@ define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x doub
define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11075,15 +10181,13 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -11109,16 +10213,14 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11131,15 +10233,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11150,16 +10250,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11172,15 +10270,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11191,16 +10287,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11213,15 +10307,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11245,16 +10337,14 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11267,15 +10357,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11286,7 +10374,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask0:
@@ -11300,16 +10388,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11323,15 +10409,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11344,16 +10428,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11367,15 +10449,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11388,16 +10468,14 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11411,15 +10489,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11432,7 +10508,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask3:
@@ -11446,16 +10522,14 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11469,15 +10543,13 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -11490,12 +10562,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i
define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
; GENERIC-LABEL: test_16xi32_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
ret <16 x i32> %res
@@ -11503,16 +10575,14 @@ define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11525,15 +10595,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -11544,16 +10612,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11566,15 +10632,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11585,16 +10649,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11607,15 +10669,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -11626,12 +10686,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
; GENERIC-LABEL: test_16xi32_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
ret <16 x i32> %res
@@ -11639,16 +10699,14 @@ define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11661,15 +10719,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %
define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -11680,12 +10736,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i
define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
@@ -11694,16 +10750,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11717,15 +10771,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11738,16 +10790,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11761,15 +10811,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11782,16 +10830,14 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11805,15 +10851,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11826,12 +10870,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16
define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -11840,16 +10884,14 @@ define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %ve
define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11863,15 +10905,13 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3
define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -11897,16 +10937,14 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11919,15 +10957,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -11938,16 +10974,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -11960,15 +10994,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11979,16 +11011,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12001,15 +11031,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12033,16 +11061,14 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12055,15 +11081,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12074,7 +11098,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask0:
@@ -12088,16 +11112,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12111,15 +11133,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12132,16 +11152,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12155,15 +11173,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12176,16 +11192,14 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12199,15 +11213,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12220,7 +11232,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask3:
@@ -12234,16 +11246,14 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12257,15 +11267,13 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -12291,16 +11299,14 @@ define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12313,15 +11319,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -12332,16 +11336,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12354,15 +11356,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -12373,16 +11373,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12395,15 +11393,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -12427,16 +11423,14 @@ define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12449,15 +11443,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2
define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -12468,7 +11460,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64>
define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_shuff_mem_mask0:
@@ -12482,16 +11474,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12505,15 +11495,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12526,16 +11514,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12549,15 +11535,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12570,16 +11554,14 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12593,15 +11575,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12614,7 +11594,7 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i
define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_shuff_mem_mask3:
@@ -12628,16 +11608,14 @@ define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p)
define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
-; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12651,15 +11629,13 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>*
define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -12685,16 +11661,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float>
define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12707,15 +11681,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12726,16 +11698,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12748,15 +11718,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12767,16 +11735,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12789,15 +11755,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12821,16 +11785,14 @@ define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float>
define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12843,15 +11805,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -12876,16 +11836,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x fl
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12899,15 +11857,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12920,16 +11876,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %v
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12943,15 +11897,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -12964,16 +11916,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %v
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -12987,15 +11937,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13022,16 +11970,14 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x fl
define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13045,15 +11991,13 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -13079,16 +12023,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float>
define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13101,15 +12043,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13120,16 +12060,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13142,15 +12080,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13161,16 +12097,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13183,15 +12117,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13215,16 +12147,14 @@ define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float>
define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13237,15 +12167,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -13270,16 +12198,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x fl
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13293,15 +12219,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13314,16 +12238,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %v
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13337,15 +12259,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13358,16 +12278,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %v
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13381,15 +12299,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13416,16 +12332,14 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x fl
define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13439,15 +12353,13 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -13460,7 +12372,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %v
define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_low_mask0:
@@ -13473,16 +12385,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x fl
define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13495,15 +12405,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13514,16 +12422,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %ve
define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13536,15 +12442,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13555,16 +12459,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %ve
define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13577,15 +12479,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13596,7 +12496,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %ve
define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_low_mask3:
@@ -13609,16 +12509,14 @@ define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x fl
define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13631,15 +12529,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -13650,7 +12546,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %ve
define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0:
@@ -13664,16 +12560,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13687,15 +12581,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13708,16 +12600,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float>
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13731,15 +12621,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13752,16 +12640,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float>
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13775,15 +12661,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13796,7 +12680,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float>
define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3:
@@ -13810,16 +12694,14 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13833,15 +12715,13 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec
define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -13867,16 +12747,14 @@ define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x dou
define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13889,15 +12767,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13908,16 +12784,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %ve
define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13930,15 +12804,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -13963,16 +12835,14 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x
define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -13986,15 +12856,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14007,16 +12875,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double>
define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14030,15 +12896,13 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec
define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -14064,16 +12928,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x dou
define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14086,15 +12948,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14105,16 +12965,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %ve
define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14127,15 +12985,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14146,16 +13002,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %ve
define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14168,15 +13022,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14200,16 +13052,14 @@ define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x dou
define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14222,15 +13072,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -14255,16 +13103,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14278,15 +13124,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14299,16 +13143,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double>
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14322,15 +13164,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14343,16 +13183,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double>
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14366,15 +13204,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14401,16 +13237,14 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x
define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14424,15 +13258,13 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec
define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -14445,7 +13277,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double>
define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
; GENERIC-LABEL: test_8xdouble_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_low_mask0:
@@ -14458,16 +13290,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x dou
define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14480,15 +13310,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14499,16 +13327,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %ve
define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14521,15 +13347,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14540,16 +13364,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %ve
define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14562,15 +13384,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14581,7 +13401,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %ve
define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
; GENERIC-LABEL: test_8xdouble_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_low_mask3:
@@ -14594,16 +13414,14 @@ define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x dou
define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14616,15 +13434,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14635,7 +13451,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %ve
define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0:
@@ -14649,16 +13465,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14672,15 +13486,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14693,16 +13505,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double>
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14716,15 +13526,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14737,16 +13545,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double>
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14760,15 +13566,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14781,7 +13585,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double>
define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3:
@@ -14795,16 +13599,14 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x
define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14818,15 +13620,13 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec
define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -14852,16 +13652,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float
define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14874,15 +13672,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14893,16 +13689,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1
define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14915,15 +13709,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14934,16 +13726,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1
define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -14956,15 +13746,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -14988,16 +13776,14 @@ define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float
define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15010,15 +13796,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -15043,16 +13827,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x f
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15066,15 +13848,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15087,16 +13867,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15110,15 +13888,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15131,16 +13907,14 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15154,15 +13928,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15189,16 +13961,14 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x f
define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15212,15 +13982,13 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1,
define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -15246,16 +14014,14 @@ define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float
define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15268,15 +14034,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15287,16 +14051,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1
define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15309,15 +14071,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15328,16 +14088,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1
define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15350,15 +14108,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15382,16 +14138,14 @@ define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float
define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15404,15 +14158,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -15437,16 +14189,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x f
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15460,15 +14210,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15481,16 +14229,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15504,15 +14250,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15525,16 +14269,14 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15548,15 +14290,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15583,16 +14323,14 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x f
define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15606,15 +14344,13 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1,
define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -15627,7 +14363,7 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %
define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
; GENERIC-LABEL: test_16xfloat_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_high_mask0:
@@ -15640,16 +14376,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x f
define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15662,15 +14396,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15681,16 +14413,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %v
define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15703,15 +14433,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15722,16 +14450,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %v
define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15744,15 +14470,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15763,7 +14487,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %v
define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
; GENERIC-LABEL: test_16xfloat_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_high_mask3:
@@ -15776,16 +14500,14 @@ define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x f
define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15798,15 +14520,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1,
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -15817,7 +14537,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %v
define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0:
@@ -15831,16 +14551,14 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15854,15 +14572,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15875,16 +14591,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15898,15 +14612,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15919,16 +14631,14 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -15942,15 +14652,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -15963,7 +14671,7 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float
define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3:
@@ -15977,16 +14685,14 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16
define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16000,15 +14706,13 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve
define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -16034,16 +14738,14 @@ define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x do
define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16056,15 +14758,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1,
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16075,16 +14775,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %v
define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16097,15 +14795,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1,
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -16130,16 +14826,14 @@ define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2
define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16153,15 +14847,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16174,16 +14866,14 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16197,15 +14887,13 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -16231,16 +14919,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x do
define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16253,15 +14939,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16272,16 +14956,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %v
define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16294,15 +14976,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16313,16 +14993,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %v
define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16335,15 +15013,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16367,16 +15043,14 @@ define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x do
define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16389,15 +15063,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1,
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -16422,16 +15094,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16445,15 +15115,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16466,16 +15134,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16489,15 +15155,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16510,16 +15174,14 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16533,15 +15195,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16568,16 +15228,14 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16591,15 +15249,13 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %ve
define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -16612,7 +15268,7 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double
define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
; GENERIC-LABEL: test_8xdouble_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_high_mask0:
@@ -16625,16 +15281,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x do
define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16647,15 +15301,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16666,16 +15318,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %v
define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16688,15 +15338,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16707,16 +15355,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %v
define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16729,15 +15375,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16748,7 +15392,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %v
define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
; GENERIC-LABEL: test_8xdouble_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_high_mask3:
@@ -16761,16 +15405,14 @@ define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x do
define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16783,15 +15425,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1,
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -16802,7 +15442,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %v
define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0:
@@ -16816,16 +15456,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16839,15 +15477,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16860,16 +15496,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16883,15 +15517,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16904,16 +15536,14 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16927,15 +15557,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -16948,7 +15576,7 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double
define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3:
@@ -16962,16 +15590,14 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8
define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -16985,15 +15611,13 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve
define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p