Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/atom-fixup-lea2.ll | 1
-rw-r--r--  test/CodeGen/X86/combine-sdiv.ll | 604
-rw-r--r--  test/CodeGen/X86/combine-shl.ll | 46
-rw-r--r--  test/CodeGen/X86/dagcombine-select.ll | 72
-rw-r--r--  test/CodeGen/X86/fast-isel-fold-mem.ll | 9
-rw-r--r--  test/CodeGen/X86/fast-isel-select.ll | 19
-rw-r--r--  test/CodeGen/X86/fast-isel-sext-zext.ll | 40
-rw-r--r--  test/CodeGen/X86/flags-copy-lowering.mir | 119
-rw-r--r--  test/CodeGen/X86/lea-opt.ll | 151
-rw-r--r--  test/CodeGen/X86/machine-outliner-tailcalls.ll | 2
-rw-r--r--  test/CodeGen/X86/mul-constant-i16.ll | 44
-rw-r--r--  test/CodeGen/X86/mul-constant-i32.ll | 112
-rw-r--r--  test/CodeGen/X86/mul-constant-i64.ll | 138
-rw-r--r--  test/CodeGen/X86/pku.ll | 16
-rw-r--r--  test/CodeGen/X86/pmaddubsw.ll | 553
-rw-r--r--  test/CodeGen/X86/rem.ll | 8
-rw-r--r--  test/CodeGen/X86/rotate-extract-vector.ll | 122
-rw-r--r--  test/CodeGen/X86/rotate-extract.ll | 52
-rw-r--r--  test/CodeGen/X86/signbit-shift.ll | 36
-rw-r--r--  test/CodeGen/X86/speculative-load-hardening.ll | 60
-rw-r--r--  test/CodeGen/X86/vector-idiv-sdiv-128.ll | 36
-rw-r--r--  test/CodeGen/X86/vector-idiv-sdiv-256.ll | 48
-rw-r--r--  test/CodeGen/X86/vector-idiv-sdiv-512.ll | 48
-rw-r--r--  test/CodeGen/X86/vector-idiv-udiv-128.ll | 36
-rw-r--r--  test/CodeGen/X86/vector-idiv-udiv-256.ll | 48
-rw-r--r--  test/CodeGen/X86/vector-idiv-udiv-512.ll | 48
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll | 49
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-256.ll | 40
-rw-r--r--  test/CodeGen/X86/win_coreclr_chkstk.ll | 7
-rw-r--r--  test/CodeGen/X86/win_coreclr_chkstk_liveins.mir | 24
30 files changed, 1704 insertions, 884 deletions
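
The recurring change across the diffs below (combine-sdiv.ll, combine-shl.ll, and the vector-idiv/vector-shift tests) is that chains of per-lane logical right shifts of a sign mask are now folded into a single pmulhuw/pmullw, using the fact that an unsigned high-half multiply by 2^c is the same as a logical right shift by 16 - c. What follows is a minimal C sketch of that identity as it appears in the signed-division-by-power-of-two lowering; the helper names (mulhu16, sdiv_pow2) are illustrative, not taken from the commit.

#include <assert.h>
#include <stdint.h>

/* Unsigned 16-bit high-half multiply, the scalar equivalent of one
   pmulhuw lane.  For m == 1 << c this computes x >> (16 - c), which is
   why one multiply can stand in for a chain of per-lane psrlw shifts. */
static uint16_t mulhu16(uint16_t x, uint16_t m) {
    return (uint16_t)(((uint32_t)x * m) >> 16);
}

/* Signed division by d == 1 << c, built the way the lowered code builds
   it: take the sign mask (0 or 0xFFFF), turn it into the bias d - 1 via
   the high multiply above, add the bias, then arithmetic-shift right.
   Assumes >> on a negative value is an arithmetic shift, as on x86. */
static int16_t sdiv_pow2(int16_t x, unsigned c) {
    uint16_t sign = (uint16_t)(x >> 15);                /* 0 or 0xFFFF  */
    uint16_t bias = mulhu16(sign, (uint16_t)(1u << c)); /* d - 1 if x<0 */
    return (int16_t)((int16_t)(x + bias) >> c);
}

int main(void) {
    for (int x = -32768; x <= 32767; ++x)
        for (unsigned c = 1; c <= 6; ++c)
            assert(sdiv_pow2((int16_t)x, c) == (int16_t)x / (1 << c));
    return 0;
}

The [1,4,2,16,8,32,64,2] constant in the new CHECK lines is this per-lane 1 << c.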
diff --git a/test/CodeGen/X86/atom-fixup-lea2.ll b/test/CodeGen/X86/atom-fixup-lea2.ll
index 9b0b472be0f3..b8a0369a45f4 100644
--- a/test/CodeGen/X86/atom-fixup-lea2.ll
+++ b/test/CodeGen/X86/atom-fixup-lea2.ll
@@ -1,5 +1,4 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; RUN: llc < %s -mcpu=goldmont -mtriple=i686-linux | FileCheck %s
; CHECK:%bb.5
; CHECK-NEXT:leal
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index cc99d71009c6..7f0573c6175c 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -285,43 +285,23 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrlw $4, %xmm3
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [49408,32992,24736,57408,49408,32992,24736,57408]
-; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrlw $2, %xmm3
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: paddb %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrlw $1, %xmm3
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: paddb %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE-NEXT: paddb %xmm1, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psraw $4, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psraw $2, %xmm4
-; SSE-NEXT: paddw %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psraw $1, %xmm4
-; SSE-NEXT: paddw %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
+; SSE-NEXT: pmullw %xmm2, %xmm3
; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm3
+; SSE-NEXT: paddb %xmm1, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psraw $4, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psraw $2, %xmm4
@@ -332,9 +312,23 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE-NEXT: paddw %xmm0, %xmm0
; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psraw $4, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psraw $2, %xmm4
+; SSE-NEXT: paddw %xmm0, %xmm0
+; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psraw $1, %xmm4
+; SSE-NEXT: paddw %xmm0, %xmm0
+; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -342,18 +336,15 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [49408,32992,24736,57408,49408,32992,24736,57408]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3
@@ -387,18 +378,11 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [49408,32992,24736,57408,49408,32992,24736,57408]
-; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT: vpsraw $4, %xmm2, %xmm3
@@ -426,6 +410,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
@@ -481,18 +466,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psrlw $4, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5,6],xmm1[7]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2],xmm1[3,4],xmm2[5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psraw $4, %xmm2
@@ -510,14 +484,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4],xmm1[5,6],xmm2[7]
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
@@ -531,10 +498,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
@@ -547,9 +511,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
@@ -583,70 +545,44 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
+; SSE-NEXT: pmulhuw %xmm3, %xmm2
+; SSE-NEXT: paddw %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psraw $4, %xmm4
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4],xmm4[5,6],xmm2[7]
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psraw $2, %xmm5
+; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psraw $1, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psraw $15, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: psrlw $4, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4],xmm3[5,6],xmm0[7]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlw $2, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7]
-; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: pmulhuw %xmm3, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psraw $4, %xmm3
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3],xmm0[4],xmm3[5,6],xmm0[7]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psraw $2, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psraw $15, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: psrlw $4, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5,6],xmm2[7]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrlw $2, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: psrlw $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
-; SSE-NEXT: paddw %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psraw $4, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psraw $2, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: psraw $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: psraw $2, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psraw $1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6],xmm3[7]
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
+; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7]
@@ -655,14 +591,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6],xmm3[7]
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7]
@@ -680,26 +609,17 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm4
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpaddw %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
-; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpsravd %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX2-NEXT: vpsravd %ymm5, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
-; AVX2-NEXT: vpsravd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
+; AVX2-NEXT: vpsravd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
@@ -708,9 +628,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
@@ -753,93 +671,50 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psrlw $8, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: psrlw $4, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5,6],xmm0[7]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psrlw $2, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0],xmm5[1,2],xmm0[3,4],xmm5[5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2]
+; SSE-NEXT: pmulhuw %xmm5, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psraw $4, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psraw $2, %xmm6
-; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: psraw $4, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3],xmm0[4],xmm6[5,6],xmm0[7]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: psraw $2, %xmm7
+; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2],xmm6[3],xmm0[4,5],xmm6[6],xmm0[7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: psraw $15, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrlw $8, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: psrlw $4, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4],xmm5[5,6],xmm1[7]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrlw $2, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3,4],xmm5[5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7]
+; SSE-NEXT: pmulhuw %xmm5, %xmm1
; SSE-NEXT: paddw %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psraw $4, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3],xmm1[4],xmm5[5,6],xmm1[7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psraw $2, %xmm6
-; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: psraw $4, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3],xmm1[4],xmm6[5,6],xmm1[7]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: psraw $2, %xmm7
+; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: psraw $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psraw $15, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psrlw $8, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrlw $4, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5,6],xmm4[7]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psrlw $2, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrlw $1, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7]
+; SSE-NEXT: pmulhuw %xmm5, %xmm4
; SSE-NEXT: paddw %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psraw $4, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psraw $2, %xmm6
-; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: psraw $4, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4],xmm6[5,6],xmm4[7]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: psraw $2, %xmm7
+; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: psraw $1, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
+; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: psraw $15, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: psrlw $8, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrlw $4, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4],xmm5[5,6],xmm2[7]
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: psrlw $2, %xmm5
-; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1,2],xmm2[3,4],xmm5[5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrlw $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
+; SSE-NEXT: pmulhuw %xmm5, %xmm2
; SSE-NEXT: paddw %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psraw $4, %xmm5
@@ -857,54 +732,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4],xmm3[5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4],xmm3[5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3,4],xmm4[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
-; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm3
-; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsraw $15, %xmm3, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4],xmm4[5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7]
@@ -912,16 +743,9 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpsraw $15, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4],xmm4[5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7]
-; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7]
; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
@@ -929,51 +753,62 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm3
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpsraw $15, %xmm3, %xmm5
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7]
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm5
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $4, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX1-NEXT: vpsraw $2, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm5
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
-; AVX2-NEXT: vpsrlvd %ymm4, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
-; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT: vpackusdw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpaddw %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
-; AVX2-NEXT: vpsravd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
+; AVX2-NEXT: vpsravd %ymm4, %ymm7, %ymm7
+; AVX2-NEXT: vpsrld $16, %ymm7, %ymm7
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
-; AVX2-NEXT: vpsravd %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vpsravd %ymm3, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX2-NEXT: vpackusdw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpackusdw %ymm7, %ymm5, %ymm5
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15]
; AVX2-NEXT: vpsraw $15, %ymm1, %ymm5
+; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpaddw %ymm5, %ymm1, %ymm5
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
-; AVX2-NEXT: vpsrlvd %ymm4, %ymm6, %ymm4
-; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
-; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpackusdw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
-; AVX2-NEXT: vpsravd %ymm8, %ymm4, %ymm4
+; AVX2-NEXT: vpsravd %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX2-NEXT: vpsravd %ymm7, %ymm2, %ymm2
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
+; AVX2-NEXT: vpsravd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
@@ -982,11 +817,9 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
@@ -995,9 +828,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
@@ -2021,43 +1852,22 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtb %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlw $4, %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [256,224,256,224,57600,57568,8416,8416]
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlw $2, %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE-NEXT: paddb %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlw $1, %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE-NEXT: paddb %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE-NEXT: paddb %xmm1, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psraw $4, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psraw $2, %xmm5
-; SSE-NEXT: paddw %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: psraw $1, %xmm5
-; SSE-NEXT: paddw %xmm0, %xmm0
-; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE-NEXT: psrlw $8, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm4
+; SSE-NEXT: paddb %xmm1, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psraw $4, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psraw $2, %xmm5
@@ -2068,9 +1878,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE-NEXT: paddw %xmm0, %xmm0
; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: packuswb %xmm4, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psraw $4, %xmm5
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psraw $2, %xmm5
+; SSE-NEXT: paddw %xmm0, %xmm0
+; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: psraw $1, %xmm5
+; SSE-NEXT: paddw %xmm0, %xmm0
+; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4
+; SSE-NEXT: psrlw $8, %xmm4
+; SSE-NEXT: packuswb %xmm3, %xmm4
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE-NEXT: psubb %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; SSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
@@ -2081,18 +1905,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,224,256,224,57600,57568,8416,8416]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
@@ -2129,18 +1949,11 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [256,224,256,224,57600,57568,8416,8416]
-; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
@@ -2171,6 +1984,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: non_splat_minus_one_divisor_1:
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index c037b0f0aa4b..1fc557f008fe 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -337,30 +337,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -385,18 +362,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -406,13 +372,9 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX: # %bb.0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,8]
-; AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
%2 = zext <8 x i16> %1 to <8 x i32>
diff --git a/test/CodeGen/X86/dagcombine-select.ll b/test/CodeGen/X86/dagcombine-select.ll
index 2b05154676e6..1ef6cfdfd40a 100644
--- a/test/CodeGen/X86/dagcombine-select.ll
+++ b/test/CodeGen/X86/dagcombine-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -verify-machineinstrs -mattr=+bmi < %s | FileCheck -check-prefix=BMI -enable-var-scope %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NOBMI -enable-var-scope
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -mattr=+bmi | FileCheck %s -check-prefixes=CHECK,BMI -enable-var-scope
define i32 @select_and1(i32 %x, i32 %y) {
; CHECK-LABEL: select_and1:
@@ -283,14 +283,14 @@ define double @frem_constant_sel_constants(i1 %cond) {
declare i64 @llvm.cttz.i64(i64, i1)
define i64 @cttz_64_eq_select(i64 %v) nounwind {
-; CHECK-LABEL: cttz_64_eq_select:
-; CHECK: # %bb.0:
-; CHECK-NEXT: bsfq %rdi, %rcx
-; CHECK-NEXT: movq $-1, %rax
-; CHECK-NEXT: cmovneq %rcx, %rax
-; CHECK-NEXT: addq $6, %rax
-; CHECK-NEXT: retq
-
+; NOBMI-LABEL: cttz_64_eq_select:
+; NOBMI: # %bb.0:
+; NOBMI-NEXT: bsfq %rdi, %rcx
+; NOBMI-NEXT: movq $-1, %rax
+; NOBMI-NEXT: cmovneq %rcx, %rax
+; NOBMI-NEXT: addq $6, %rax
+; NOBMI-NEXT: retq
+;
; BMI-LABEL: cttz_64_eq_select:
; BMI: # %bb.0:
; BMI-NEXT: tzcntq %rdi, %rcx
@@ -298,6 +298,7 @@ define i64 @cttz_64_eq_select(i64 %v) nounwind {
; BMI-NEXT: cmovaeq %rcx, %rax
; BMI-NEXT: addq $6, %rax
; BMI-NEXT: retq
+
%cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
%tobool = icmp eq i64 %v, 0
%.op = add nuw nsw i64 %cnt, 6
@@ -306,14 +307,14 @@ define i64 @cttz_64_eq_select(i64 %v) nounwind {
}
define i64 @cttz_64_ne_select(i64 %v) nounwind {
-; CHECK-LABEL: cttz_64_ne_select:
-; CHECK: # %bb.0:
-; CHECK-NEXT: bsfq %rdi, %rcx
-; CHECK-NEXT: movq $-1, %rax
-; CHECK-NEXT: cmovneq %rcx, %rax
-; CHECK-NEXT: addq $6, %rax
-; CHECK-NEXT: retq
-
+; NOBMI-LABEL: cttz_64_ne_select:
+; NOBMI: # %bb.0:
+; NOBMI-NEXT: bsfq %rdi, %rcx
+; NOBMI-NEXT: movq $-1, %rax
+; NOBMI-NEXT: cmovneq %rcx, %rax
+; NOBMI-NEXT: addq $6, %rax
+; NOBMI-NEXT: retq
+;
; BMI-LABEL: cttz_64_ne_select:
; BMI: # %bb.0:
; BMI-NEXT: tzcntq %rdi, %rcx
@@ -321,6 +322,7 @@ define i64 @cttz_64_ne_select(i64 %v) nounwind {
; BMI-NEXT: cmovaeq %rcx, %rax
; BMI-NEXT: addq $6, %rax
; BMI-NEXT: retq
+
%cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
%tobool = icmp ne i64 %v, 0
%.op = add nuw nsw i64 %cnt, 6
@@ -330,14 +332,14 @@ define i64 @cttz_64_ne_select(i64 %v) nounwind {
declare i32 @llvm.cttz.i32(i32, i1)
define i32 @cttz_32_eq_select(i32 %v) nounwind {
-; CHECK-LABEL: cttz_32_eq_select:
-; CHECK: # %bb.0:
-; CHECK-NEXT: bsfl %edi, %ecx
-; CHECK-NEXT: movl $-1, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
-; CHECK-NEXT: addl $6, %eax
-; CHECK-NEXT: retq
-
+; NOBMI-LABEL: cttz_32_eq_select:
+; NOBMI: # %bb.0:
+; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: movl $-1, %eax
+; NOBMI-NEXT: cmovnel %ecx, %eax
+; NOBMI-NEXT: addl $6, %eax
+; NOBMI-NEXT: retq
+;
; BMI-LABEL: cttz_32_eq_select:
; BMI: # %bb.0:
; BMI-NEXT: tzcntl %edi, %ecx
@@ -345,6 +347,7 @@ define i32 @cttz_32_eq_select(i32 %v) nounwind {
; BMI-NEXT: cmovael %ecx, %eax
; BMI-NEXT: addl $6, %eax
; BMI-NEXT: retq
+
%cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
%tobool = icmp eq i32 %v, 0
%.op = add nuw nsw i32 %cnt, 6
@@ -353,14 +356,14 @@ define i32 @cttz_32_eq_select(i32 %v) nounwind {
}
define i32 @cttz_32_ne_select(i32 %v) nounwind {
-; CHECK-LABEL: cttz_32_ne_select:
-; CHECK: # %bb.0:
-; CHECK-NEXT: bsfl %edi, %ecx
-; CHECK-NEXT: movl $-1, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
-; CHECK-NEXT: addl $6, %eax
-; CHECK-NEXT: retq
-
+; NOBMI-LABEL: cttz_32_ne_select:
+; NOBMI: # %bb.0:
+; NOBMI-NEXT: bsfl %edi, %ecx
+; NOBMI-NEXT: movl $-1, %eax
+; NOBMI-NEXT: cmovnel %ecx, %eax
+; NOBMI-NEXT: addl $6, %eax
+; NOBMI-NEXT: retq
+;
; BMI-LABEL: cttz_32_ne_select:
; BMI: # %bb.0:
; BMI-NEXT: tzcntl %edi, %ecx
@@ -368,6 +371,7 @@ define i32 @cttz_32_ne_select(i32 %v) nounwind {
; BMI-NEXT: cmovael %ecx, %eax
; BMI-NEXT: addl $6, %eax
; BMI-NEXT: retq
+
%cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
%tobool = icmp ne i32 %v, 0
%.op = add nuw nsw i32 %cnt, 6
diff --git a/test/CodeGen/X86/fast-isel-fold-mem.ll b/test/CodeGen/X86/fast-isel-fold-mem.ll
index 5686484ef935..1c5171926c4b 100644
--- a/test/CodeGen/X86/fast-isel-fold-mem.ll
+++ b/test/CodeGen/X86/fast-isel-fold-mem.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin | FileCheck %s
define i64 @fold_load(i64* %a, i64 %b) {
-; CHECK-LABEL: fold_load
-; CHECK: addq (%rdi), %rsi
-; CHECK-NEXT: movq %rsi, %rax
+; CHECK-LABEL: fold_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: addq (%rdi), %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
%1 = load i64, i64* %a, align 8
%2 = add i64 %1, %b
ret i64 %2
diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll
index 7b3c99f13cca..cf459f85b33e 100644
--- a/test/CodeGen/X86/fast-isel-select.ll
+++ b/test/CodeGen/X86/fast-isel-select.ll
@@ -1,14 +1,23 @@
-; RUN: llc -mtriple x86_64-apple-darwin -O0 -o - < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -O0 | FileCheck %s
+
; Make sure we only use the least significant bit of the value that feeds the
; select. Otherwise, we may treat the value as non-zero even though its least
; significant bit is zero. (A minimal IR sketch of this pattern follows the test.)
; <rdar://problem/15651765>
-; CHECK-LABEL: fastisel_select:
-; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]]
-; CHECK: testb $1, [[RES]]
-; CHECK: cmovnel %edi, %esi
define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) {
+; CHECK-LABEL: fastisel_select:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movb %sil, %al
+; CHECK-NEXT: movb %dil, %cl
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: subb %al, %cl
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: movl $1204476887, %edi ## imm = 0x47CADBD7
+; CHECK-NEXT: cmovnel %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
%shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766
%counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0
ret i32 %counter_diff1345
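
For reference, a minimal sketch of the shape being guarded above, assuming the same x86_64 -O0 fast-isel setup as the RUN line (the function and value names here are illustrative and not part of fast-isel-select.ll): the i1 produced by the sub lives in a byte register whose bits 7..1 are undefined, so the select must be decided by a `testb $1` on that register, as the CHECK lines show, rather than by a full-width compare.

; Hypothetical illustration only, not part of the test file.
define i32 @lsb_only_select(i1 %a, i1 %b) {
  %d = sub i1 %a, %b              ; i1 result; upper bits of the holding register are undefined
  %r = select i1 %d, i32 1, i32 0 ; must be decided by bit 0 of %d alone
  ret i32 %r
}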
diff --git a/test/CodeGen/X86/fast-isel-sext-zext.ll b/test/CodeGen/X86/fast-isel-sext-zext.ll
index 5e54c98b0d14..82ed6c72ebca 100644
--- a/test/CodeGen/X86/fast-isel-sext-zext.ll
+++ b/test/CodeGen/X86/fast-isel-sext-zext.ll
@@ -9,7 +9,6 @@ define i8 @test1(i8 %x) nounwind {
; X32-NEXT: andb $1, %al
; X32-NEXT: negb %al
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test1:
; X64: ## %bb.0:
@@ -17,7 +16,6 @@ define i8 @test1(i8 %x) nounwind {
; X64-NEXT: negb %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i8 %x to i1
%u = sext i1 %z to i8
ret i8 %u
@@ -32,7 +30,6 @@ define i16 @test2(i16 %x) nounwind {
; X32-NEXT: movsbl %al, %eax
; X32-NEXT: ## kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test2:
; X64: ## %bb.0:
@@ -41,7 +38,6 @@ define i16 @test2(i16 %x) nounwind {
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i16 %x to i1
%u = sext i1 %z to i16
ret i16 %u
@@ -55,7 +51,6 @@ define i32 @test3(i32 %x) nounwind {
; X32-NEXT: negb %al
; X32-NEXT: movsbl %al, %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test3:
; X64: ## %bb.0:
@@ -63,7 +58,6 @@ define i32 @test3(i32 %x) nounwind {
; X64-NEXT: negb %dil
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i32 %x to i1
%u = sext i1 %z to i32
ret i32 %u
@@ -77,7 +71,6 @@ define i32 @test4(i32 %x) nounwind {
; X32-NEXT: negb %al
; X32-NEXT: movsbl %al, %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test4:
; X64: ## %bb.0:
@@ -85,7 +78,6 @@ define i32 @test4(i32 %x) nounwind {
; X64-NEXT: negb %dil
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i32 %x to i1
%u = sext i1 %z to i32
ret i32 %u
@@ -97,14 +89,12 @@ define i8 @test5(i8 %x) nounwind {
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $1, %al
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test5:
; X64: ## %bb.0:
; X64-NEXT: andb $1, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i8 %x to i1
%u = zext i1 %z to i8
ret i8 %u
@@ -118,7 +108,6 @@ define i16 @test6(i16 %x) nounwind {
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: ## kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test6:
; X64: ## %bb.0:
@@ -126,7 +115,6 @@ define i16 @test6(i16 %x) nounwind {
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i16 %x to i1
%u = zext i1 %z to i16
ret i16 %u
@@ -139,14 +127,12 @@ define i32 @test7(i32 %x) nounwind {
; X32-NEXT: andb $1, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test7:
; X64: ## %bb.0:
; X64-NEXT: andb $1, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i32 %x to i1
%u = zext i1 %z to i32
ret i32 %u
@@ -159,14 +145,12 @@ define i32 @test8(i32 %x) nounwind {
; X32-NEXT: andb $1, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test8:
; X64: ## %bb.0:
; X64-NEXT: andb $1, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%z = trunc i32 %x to i1
%u = zext i1 %z to i32
ret i32 %u
@@ -178,14 +162,12 @@ define i16 @test9(i8 %x) nounwind {
; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: ## kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test9:
; X64: ## %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i8 %x to i16
ret i16 %u
}
@@ -195,13 +177,11 @@ define i32 @test10(i8 %x) nounwind {
; X32: ## %bb.0:
; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test10:
; X64: ## %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i8 %x to i32
ret i32 %u
}
@@ -213,13 +193,11 @@ define i64 @test11(i8 %x) nounwind {
; X32-NEXT: movl %eax, %edx
; X32-NEXT: sarl $31, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test11:
; X64: ## %bb.0:
; X64-NEXT: movsbq %dil, %rax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i8 %x to i64
ret i64 %u
}
@@ -230,14 +208,12 @@ define i16 @test12(i8 %x) nounwind {
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: ## kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test12:
; X64: ## %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i8 %x to i16
ret i16 %u
}
@@ -247,13 +223,11 @@ define i32 @test13(i8 %x) nounwind {
; X32: ## %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test13:
; X64: ## %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i8 %x to i32
ret i32 %u
}
@@ -264,13 +238,11 @@ define i64 @test14(i8 %x) nounwind {
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test14:
; X64: ## %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i8 %x to i64
ret i64 %u
}
@@ -280,13 +252,11 @@ define i32 @test15(i16 %x) nounwind {
; X32: ## %bb.0:
; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test15:
; X64: ## %bb.0:
; X64-NEXT: movswl %di, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i16 %x to i32
ret i32 %u
}
@@ -298,13 +268,11 @@ define i64 @test16(i16 %x) nounwind {
; X32-NEXT: movl %eax, %edx
; X32-NEXT: sarl $31, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test16:
; X64: ## %bb.0:
; X64-NEXT: movswq %di, %rax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i16 %x to i64
ret i64 %u
}
@@ -314,13 +282,11 @@ define i32 @test17(i16 %x) nounwind {
; X32: ## %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test17:
; X64: ## %bb.0:
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i16 %x to i32
ret i32 %u
}
@@ -331,13 +297,11 @@ define i64 @test18(i16 %x) nounwind {
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test18:
; X64: ## %bb.0:
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i16 %x to i64
ret i64 %u
}
@@ -349,13 +313,11 @@ define i64 @test19(i32 %x) nounwind {
; X32-NEXT: movl %eax, %edx
; X32-NEXT: sarl $31, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test19:
; X64: ## %bb.0:
; X64-NEXT: movslq %edi, %rax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = sext i32 %x to i64
ret i64 %u
}
@@ -366,13 +328,11 @@ define i64 @test20(i32 %x) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
-; X32-NEXT: ## -- End function
;
; X64-LABEL: test20:
; X64: ## %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
-; X64-NEXT: ## -- End function
%u = zext i32 %x to i64
ret i64 %u
}
diff --git a/test/CodeGen/X86/flags-copy-lowering.mir b/test/CodeGen/X86/flags-copy-lowering.mir
index 54ce02aaca58..d5991754d40b 100644
--- a/test/CodeGen/X86/flags-copy-lowering.mir
+++ b/test/CodeGen/X86/flags-copy-lowering.mir
@@ -90,6 +90,18 @@
call void @foo()
ret i64 0
}
+
+ define i32 @test_existing_setcc(i64 %a, i64 %b) {
+ entry:
+ call void @foo()
+ ret i32 0
+ }
+
+ define i32 @test_existing_setcc_memory(i64 %a, i64 %b) {
+ entry:
+ call void @foo()
+ ret i32 0
+ }
...
---
name: test_branch
@@ -936,3 +948,110 @@ body: |
; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
...
+---
+name: test_existing_setcc
+# CHECK-LABEL: name: test_existing_setcc
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2, %bb.3
+ liveins: $rdi, $rsi
+
+ %0:gr64 = COPY $rdi
+ %1:gr64 = COPY $rsi
+ CMP64rr %0, %1, implicit-def $eflags
+ %2:gr8 = SETAr implicit $eflags
+ %3:gr8 = SETAEr implicit $eflags
+ %4:gr64 = COPY $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK-NEXT: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+ ; CHECK-NEXT: %[[AE_REG:[^:]*]]:gr8 = SETAEr implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+ $eflags = COPY %4
+ JA_1 %bb.1, implicit $eflags
+ JB_1 %bb.2, implicit $eflags
+ JMP_1 %bb.3
+ ; CHECK-NOT: $eflags =
+ ;
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags
+ ; CHECK-SAME: {{$[[:space:]]}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: {{.*$}}
+ ; CHECK-SAME: {{$[[:space:]]}}
+ ; CHECK-NEXT: TEST8rr %[[AE_REG]], %[[AE_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JE_1 %bb.2, implicit killed $eflags
+ ; CHECK-NEXT: JMP_1 %bb.3
+
+ bb.1:
+ %5:gr32 = MOV32ri64 42
+ $eax = COPY %5
+ RET 0, $eax
+
+ bb.2:
+ %6:gr32 = MOV32ri64 43
+ $eax = COPY %6
+ RET 0, $eax
+
+ bb.3:
+ %7:gr32 = MOV32r0 implicit-def dead $eflags
+ $eax = COPY %7
+ RET 0, $eax
+
+...
+---
+name: test_existing_setcc_memory
+# CHECK-LABEL: name: test_existing_setcc_memory
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $rdi, $rsi
+
+ %0:gr64 = COPY $rdi
+ %1:gr64 = COPY $rsi
+ CMP64rr %0, %1, implicit-def $eflags
+ SETEm %0, 1, $noreg, -16, $noreg, implicit $eflags
+ %2:gr64 = COPY $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; We cannot reuse this SETE because it stores the flag directly to memory,
+ ; so we have two SETEs here. FIXME: It'd be great if something could fold
+ ; these automatically. If not, maybe we want to unfold SETcc instructions
+ ; writing to memory so we can reuse them.
+ ; CHECK-NEXT: SETEm {{.*}} implicit $eflags
+ ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+ $eflags = COPY %2
+ JE_1 %bb.1, implicit $eflags
+ JMP_1 %bb.2
+ ; CHECK-NOT: $eflags =
+ ;
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags
+ ; CHECK-NEXT: JMP_1 %bb.2
+
+ bb.1:
+ %3:gr32 = MOV32ri64 42
+ $eax = COPY %3
+ RET 0, $eax
+
+ bb.2:
+ %4:gr32 = MOV32ri64 43
+ $eax = COPY %4
+ RET 0, $eax
+
+...
diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll
index b285a4ed5224..6899babf31de 100644
--- a/test/CodeGen/X86/lea-opt.ll
+++ b/test/CodeGen/X86/lea-opt.ll
@@ -307,3 +307,154 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
}
+
+define i32 @test5(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: subl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 %y, -2
+ %add = add nsw i32 %mul, %x
+ ret i32 %add
+}
+
+define i32 @test6(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (%rsi,%rsi,2), %eax
+; CHECK-NEXT: subl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 %y, -3
+ %add = add nsw i32 %mul, %x
+ ret i32 %add
+}
+
+define i32 @test7(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shll $2, %esi
+; CHECK-NEXT: subl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 %y, -4
+ %add = add nsw i32 %mul, %x
+ ret i32 %add
+}
+
+define i32 @test8(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (,%rsi,4), %eax
+; CHECK-NEXT: subl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = shl nsw i32 %y, 2
+ %sub = sub nsw i32 %mul, %x
+ ret i32 %sub
+}
+
+
+define i32 @test9(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: subl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 -2, %y
+ %add = add nsw i32 %x, %mul
+ ret i32 %add
+}
+
+define i32 @test10(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test10:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (%rsi,%rsi,2), %eax
+; CHECK-NEXT: subl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 -3, %y
+ %add = add nsw i32 %x, %mul
+ ret i32 %add
+}
+
+define i32 @test11(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test11:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shll $2, %esi
+; CHECK-NEXT: subl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 -4, %y
+ %add = add nsw i32 %x, %mul
+ ret i32 %add
+}
+
+define i32 @test12(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test12:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (,%rsi,4), %eax
+; CHECK-NEXT: subl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 4, %y
+ %sub = sub nsw i32 %mul, %x
+ ret i32 %sub
+}
+
+define i64 @test13(i64 %x, i64 %y) #0 {
+; CHECK-LABEL: test13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shlq $2, %rsi
+; CHECK-NEXT: subq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i64 -4, %y
+ %add = add nsw i64 %x, %mul
+ ret i64 %add
+}
+
+define i32 @test14(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: test14:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (,%rsi,4), %eax
+; CHECK-NEXT: subl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %mul = mul nsw i32 4, %y
+ %sub = sub nsw i32 %mul, %x
+ ret i32 %sub
+}
+
+define zeroext i16 @test15(i16 zeroext %x, i16 zeroext %y) #0 {
+; CHECK-LABEL: test15:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: subl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %conv = zext i16 %x to i32
+ %conv1 = zext i16 %y to i32
+ %mul = mul nsw i32 -8, %conv1
+ %add = add nsw i32 %conv, %mul
+ %conv2 = trunc i32 %add to i16
+ ret i16 %conv2
+}
+
+attributes #0 = { norecurse nounwind optsize readnone uwtable}
diff --git a/test/CodeGen/X86/machine-outliner-tailcalls.ll b/test/CodeGen/X86/machine-outliner-tailcalls.ll
index 6f28354c386b..71ebade623cf 100644
--- a/test/CodeGen/X86/machine-outliner-tailcalls.ll
+++ b/test/CodeGen/X86/machine-outliner-tailcalls.ll
@@ -1,4 +1,4 @@
-; RUN: llc -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s
@x = common local_unnamed_addr global i32 0, align 4
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
index 737bcc7c864d..cf367ecbb98e 100644
--- a/test/CodeGen/X86/mul-constant-i16.ll
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -766,6 +766,50 @@ define i16 @test_mul_by_520(i16 %x) {
ret i16 %mul
}
+define i16 @test_mul_by_neg10(i16 %x) {
+; X86-LABEL: test_mul_by_neg10:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mul_by_neg10:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: addl %edi, %edi
+; X64-NEXT: leal (%rdi,%rdi,4), %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %mul = mul nsw i16 %x, -10
+ ret i16 %mul
+}
+
+define i16 @test_mul_by_neg36(i16 %x) {
+; X86-LABEL: test_mul_by_neg36:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $2, %eax
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mul_by_neg36:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shll $2, %edi
+; X64-NEXT: leal (%rdi,%rdi,8), %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %mul = mul nsw i16 %x, -36
+ ret i16 %mul
+}
+
; (x*9+42)*(x*5+2)
define i16 @test_mul_spec(i16 %x) nounwind {
; X86-LABEL: test_mul_spec:
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
index 356d5a00abf6..04f867bb4e16 100644
--- a/test/CodeGen/X86/mul-constant-i32.ll
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -1997,6 +1997,118 @@ define i32 @test_mul_by_520(i32 %x) {
ret i32 %mul
}
+define i32 @test_mul_by_neg10(i32 %x) {
+; X86-LABEL: test_mul_by_neg10:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: test_mul_by_neg10:
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
+; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT: negl %eax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_neg10:
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
+; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
+; X64-JAG-NEXT: negl %eax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_neg10:
+; X86-NOOPT: # %bb.0:
+; X86-NOOPT-NEXT: imull $-10, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_neg10:
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_neg10:
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_neg10:
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
+; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT: negl %eax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_neg10:
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i32 %x, -10
+ ret i32 %mul
+}
+
+define i32 @test_mul_by_neg36(i32 %x) {
+; X86-LABEL: test_mul_by_neg36:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $2, %eax
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: test_mul_by_neg36:
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT: negl %eax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_neg36:
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
+; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
+; X64-JAG-NEXT: negl %eax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_neg36:
+; X86-NOOPT: # %bb.0:
+; X86-NOOPT-NEXT: imull $-36, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_neg36:
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_neg36:
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_neg36:
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
+; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT: negl %eax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_neg36:
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i32 %x, -36
+ ret i32 %mul
+}
+
; (x*9+42)*(x*5+2)
define i32 @test_mul_spec(i32 %x) nounwind {
; X86-LABEL: test_mul_spec:
diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll
index 332ad7f01299..761ca67ab31b 100644
--- a/test/CodeGen/X86/mul-constant-i64.ll
+++ b/test/CodeGen/X86/mul-constant-i64.ll
@@ -2107,6 +2107,144 @@ define i64 @test_mul_by_520(i64 %x) {
ret i64 %mul
}
+define i64 @test_mul_by_neg10(i64 %x) {
+; X86-LABEL: test_mul_by_neg10:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $-10, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: leal (%esi,%esi,4), %ecx
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: test_mul_by_neg10:
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: negq %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_neg10:
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [2:1.00]
+; X64-JAG-NEXT: negq %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_neg10:
+; X86-NOOPT: # %bb.0:
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: movl $-10, %edx
+; X86-NOOPT-NEXT: movl %ecx, %eax
+; X86-NOOPT-NEXT: mull %edx
+; X86-NOOPT-NEXT: subl %ecx, %edx
+; X86-NOOPT-NEXT: imull $-10, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_neg10:
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_neg10:
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [6:4.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_neg10:
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
+; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: negq %rax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_neg10:
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i64 %x, -10
+ ret i64 %mul
+}
+
+define i64 @test_mul_by_neg36(i64 %x) {
+; X86-LABEL: test_mul_by_neg36:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $-36, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: leal (%esi,%esi,8), %ecx
+; X86-NEXT: shll $2, %ecx
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: test_mul_by_neg36:
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: negq %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_neg36:
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [2:1.00]
+; X64-JAG-NEXT: negq %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_neg36:
+; X86-NOOPT: # %bb.0:
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: movl $-36, %edx
+; X86-NOOPT-NEXT: movl %ecx, %eax
+; X86-NOOPT-NEXT: mull %edx
+; X86-NOOPT-NEXT: subl %ecx, %edx
+; X86-NOOPT-NEXT: imull $-36, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_neg36:
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_neg36:
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [6:4.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_neg36:
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: negq %rax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_neg36:
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i64 %x, -36
+ ret i64 %mul
+}
+
; (x*9+42)*(x*5+2)
define i64 @test_mul_spec(i64 %x) nounwind {
; X86-LABEL: test_mul_spec:
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
index 96ee97341749..6031bafb0972 100644
--- a/test/CodeGen/X86/pku.ll
+++ b/test/CodeGen/X86/pku.ll
@@ -26,17 +26,11 @@ define void @test_x86_wrpkru(i32 %src) {
}
define i32 @test_x86_rdpkru() {
-; X86-LABEL: test_x86_rdpkru:
-; X86: ## %bb.0:
-; X86-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9]
-; X86-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee]
-; X86-NEXT: retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_rdpkru:
-; X64: ## %bb.0:
-; X64-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9]
-; X64-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee]
-; X64-NEXT: retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_rdpkru:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9]
+; CHECK-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call i32 @llvm.x86.rdpkru()
ret i32 %res
}
diff --git a/test/CodeGen/X86/pmaddubsw.ll b/test/CodeGen/X86/pmaddubsw.ll
new file mode 100644
index 000000000000..d44315af2c6b
--- /dev/null
+++ b/test/CodeGen/X86/pmaddubsw.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW
+
+; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
+; Without the loads, matching the pattern would require the combine to recreate the concat_vectors.
+; (A scalar sketch of the per-lane computation follows the first test below.)
+define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_128:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: pmaddubsw (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: pmaddubsw_128:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
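
To make the vector checks above easier to read, here is a scalar sketch of the per-lane computation that the IR in these tests encodes (illustrative only; this function is not part of pmaddubsw.ll): for one even/odd byte pair, sign-extend the A bytes, zero-extend the B bytes, multiply elementwise, add the pair, and saturate the sum to the i16 range.

; Hypothetical illustration of a single output lane.
define i16 @pmaddubsw_one_lane(i8 %a_even, i8 %a_odd, i8 %b_even, i8 %b_odd) {
  %ae = sext i8 %a_even to i32   ; A bytes are sign-extended
  %be = zext i8 %b_even to i32   ; B bytes are zero-extended
  %ao = sext i8 %a_odd to i32
  %bo = zext i8 %b_odd to i32
  %m0 = mul i32 %ae, %be
  %m1 = mul i32 %ao, %bo
  %sum = add i32 %m0, %m1
  %gt = icmp sgt i32 %sum, -32768
  %max = select i1 %gt, i32 %sum, i32 -32768   ; clamp below at -32768
  %lt = icmp slt i32 %max, 32767
  %min = select i1 %lt, i32 %max, i32 32767    ; clamp above at 32767
  %r = trunc i32 %min to i16
  ret i16 %r
}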
+
+define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_256:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: movdqa 16(%rsi), %xmm1
+; SSE-NEXT: pmaddubsw (%rdi), %xmm0
+; SSE-NEXT: pmaddubsw 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: pmaddubsw_256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: pmaddubsw_256:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
+; AVX256-NEXT: retq
+ %A = load <32 x i8>, <32 x i8>* %Aptr
+ %B = load <32 x i8>, <32 x i8>* %Bptr
+ %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
+ %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
+ %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
+ %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
+ %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <16 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <16 x i32> %min to <16 x i16>
+ ret <16 x i16> %trunc
+}
+
+define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_512:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa 112(%rdx), %xmm0
+; SSE-NEXT: movdqa 96(%rdx), %xmm1
+; SSE-NEXT: movdqa 80(%rdx), %xmm2
+; SSE-NEXT: movdqa 64(%rdx), %xmm3
+; SSE-NEXT: movdqa (%rdx), %xmm4
+; SSE-NEXT: movdqa 16(%rdx), %xmm5
+; SSE-NEXT: movdqa 32(%rdx), %xmm6
+; SSE-NEXT: movdqa 48(%rdx), %xmm7
+; SSE-NEXT: pmaddubsw (%rsi), %xmm4
+; SSE-NEXT: pmaddubsw 16(%rsi), %xmm5
+; SSE-NEXT: pmaddubsw 32(%rsi), %xmm6
+; SSE-NEXT: pmaddubsw 48(%rsi), %xmm7
+; SSE-NEXT: pmaddubsw 64(%rsi), %xmm3
+; SSE-NEXT: pmaddubsw 80(%rsi), %xmm2
+; SSE-NEXT: pmaddubsw 96(%rsi), %xmm1
+; SSE-NEXT: pmaddubsw 112(%rsi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 112(%rdi)
+; SSE-NEXT: movdqa %xmm1, 96(%rdi)
+; SSE-NEXT: movdqa %xmm2, 80(%rdi)
+; SSE-NEXT: movdqa %xmm3, 64(%rdi)
+; SSE-NEXT: movdqa %xmm7, 48(%rdi)
+; SSE-NEXT: movdqa %xmm6, 32(%rdi)
+; SSE-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE-NEXT: movdqa %xmm4, (%rdi)
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: pmaddubsw_512:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8
+; AVX1-NEXT: vmovdqa (%rsi), %ymm4
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5
+; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6
+; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm8, %xmm9, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pmaddubsw_512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2
+; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3
+; AVX2-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2
+; AVX2-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: pmaddubsw_512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2
+; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3
+; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2
+; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: pmaddubsw_512:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-NEXT: vpmaddubsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaddubsw 64(%rdi), %zmm1, %zmm1
+; AVX512BW-NEXT: retq
+ %A = load <128 x i8>, <128 x i8>* %Aptr
+ %B = load <128 x i8>, <128 x i8>* %Bptr
+ %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
+ %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
+ %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
+ %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
+ %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
+ %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
+ %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
+ %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
+ %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <64 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <64 x i32> %min to <64 x i16>
+ ret <64 x i16> %trunc
+}
+
+define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_swapped_indices:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: pmaddubsw (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: pmaddubsw_swapped_indices:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
+ %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
+
+define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_swapped_extend:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: pmaddubsw (%rsi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: pmaddubsw_swapped_extend:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
+
+define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_commuted_mul:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: pmaddubsw (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: pmaddubsw_commuted_mul:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
+
+define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_bad_extend:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllw $8, %xmm3
+; SSE-NEXT: psraw $8, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pmulhw %xmm2, %xmm4
+; SSE-NEXT: pmullw %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pmulhw %xmm0, %xmm4
+; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: pmaddubsw_bad_extend:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pmaddubsw_bad_extend:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa (%rsi), %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pmaddubsw_bad_extend:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
+
+define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
+; SSE-LABEL: pmaddubsw_bad_indices:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa (%rsi), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
+; SSE-NEXT: psraw $8, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pmulhw %xmm2, %xmm4
+; SSE-NEXT: pmullw %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pmulhw %xmm0, %xmm4
+; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: pmaddubsw_bad_indices:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pmaddubsw_bad_indices:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa (%rsi), %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pmaddubsw_bad_indices:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
+; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %A = load <16 x i8>, <16 x i8>* %Aptr
+ %B = load <16 x i8>, <16 x i8>* %Bptr
+ %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
+ %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
+ %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
+ %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
+ %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
+ %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
+ %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
+ %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
+ %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
+ %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
+ %add = add <8 x i32> %even_mul, %odd_mul
+ %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
+ %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+ %trunc = trunc <8 x i32> %min to <8 x i16>
+ ret <8 x i16> %trunc
+}
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 672baa5c1bdc..5f2cc199bcf4 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -15,8 +15,8 @@ define i32 @test1(i32 %X) {
; CHECK-NEXT: addl %eax, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: subl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: subl %eax, %edx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = srem i32 %X, 255
@@ -48,8 +48,8 @@ define i32 @test3(i32 %X) {
; CHECK-NEXT: shrl $7, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: subl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: subl %eax, %edx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = urem i32 %X, 255
diff --git a/test/CodeGen/X86/rotate-extract-vector.ll b/test/CodeGen/X86/rotate-extract-vector.ll
index 6059a76259ba..e2679dded8b5 100644
--- a/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/test/CodeGen/X86/rotate-extract-vector.ll
@@ -12,10 +12,10 @@
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm1
-; CHECK-NEXT: vpslld $10, %xmm0, %xmm0
-; CHECK-NEXT: vpsrld $25, %xmm1, %xmm1
-; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
+; CHECK-NEXT: vprold $7, %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
%rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
@@ -25,20 +25,12 @@ define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
}
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
-; X86-LABEL: vrolq_v4i64_extract_shrl:
-; X86: # %bb.0:
-; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-NEXT: vprolq $24, %zmm0, %zmm0
-; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
-; X86-NEXT: retl
-;
-; X64-LABEL: vrolq_v4i64_extract_shrl:
-; X64: # %bb.0:
-; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-NEXT: vprolq $24, %zmm0, %zmm0
-; X64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919]
-; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: vrolq_v4i64_extract_shrl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0
+; CHECK-NEXT: vprolq $29, %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
%rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
%rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
@@ -49,12 +41,10 @@ define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640]
-; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10]
-; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $26, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
+; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vprold $6, %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
%rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
@@ -66,7 +56,7 @@ define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86: # %bb.0:
-; X86-NEXT: subl $60, %esp
+; X86-NEXT: subl $44, %esp
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -85,53 +75,27 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: vextractps $2, %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpsllq $57, %xmm1, %xmm1
-; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT: addl $60, %esp
+; X86-NEXT: vprolq $57, %zmm0, %zmm0
+; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: vmovq %rax, %xmm1
+; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: mulq %rcx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: vmovq %rax, %xmm0
+; X64-NEXT: mulq %rcx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: shrq $8, %rcx
-; X64-NEXT: vmovq %rcx, %xmm1
-; X64-NEXT: shrq $8, %rdx
-; X64-NEXT: vmovq %rdx, %xmm2
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-NEXT: vpsllq $57, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-NEXT: vprolq $57, %zmm0, %zmm0
+; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NEXT: vzeroupper
; X64-NEXT: retq
%lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
%rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
@@ -141,17 +105,23 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
}
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
-; CHECK-LABEL: vrolw_extract_mul_with_mask:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152]
-; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
-; CHECK-NEXT: vpmulld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160]
-; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpsrld $25, %xmm0, %xmm0
-; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: vrolw_extract_mul_with_mask:
+; X86: # %bb.0:
+; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
+; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X86-NEXT: vprold $7, %zmm0, %zmm0
+; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: vrolw_extract_mul_with_mask:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
+; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT: vprold $7, %zmm0, %zmm0
+; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
%rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
%lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
diff --git a/test/CodeGen/X86/rotate-extract.ll b/test/CodeGen/X86/rotate-extract.ll
index 6ce3db13e954..a1babd1d3cc3 100644
--- a/test/CodeGen/X86/rotate-extract.ll
+++ b/test/CodeGen/X86/rotate-extract.ll
@@ -24,9 +24,7 @@ define i64 @rolq_extract_shl(i64 %i) nounwind {
; X64-LABEL: rolq_extract_shl:
; X64: # %bb.0:
; X64-NEXT: leaq (,%rdi,8), %rax
-; X64-NEXT: shlq $10, %rdi
-; X64-NEXT: shrq $57, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: rolq $7, %rax
; X64-NEXT: retq
%lhs_mul = shl i64 %i, 3
%rhs_mul = shl i64 %i, 10
@@ -39,16 +37,17 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind {
; X86-LABEL: rolw_extract_shrl:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: rolw $9, %ax
-; X86-NEXT: andl $61951, %eax # imm = 0xF1FF
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: rolw $12, %ax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: rolw_extract_shrl:
; X64: # %bb.0:
-; X64-NEXT: rolw $9, %di
-; X64-NEXT: andl $61951, %edi # imm = 0xF1FF
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: rolw $12, %ax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%lhs_div = lshr i16 %i, 7
%rhs_div = lshr i16 %i, 3
@@ -60,22 +59,16 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind {
define i32 @roll_extract_mul(i32 %i) nounwind {
; X86-LABEL: roll_extract_mul:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: shll $7, %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %ecx
-; X86-NEXT: shrl $25, %eax
-; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: roll $7, %eax
; X86-NEXT: retl
;
; X64-LABEL: roll_extract_mul:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: shll $7, %edi
-; X64-NEXT: leal (%rdi,%rdi,8), %ecx
-; X64-NEXT: shrl $25, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: roll $7, %eax
; X64-NEXT: retq
%lhs_mul = mul i32 %i, 9
%rhs_mul = mul i32 %i, 1152
@@ -89,11 +82,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: movb %ah, %cl
-; X86-NEXT: shlb $3, %cl
-; X86-NEXT: andb $-16, %cl
-; X86-NEXT: shrl $13, %eax
-; X86-NEXT: orb %cl, %al
+; X86-NEXT: shrl $9, %eax
+; X86-NEXT: rolb $4, %al
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
@@ -101,12 +91,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $171, %eax, %eax
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $8, %ecx
-; X64-NEXT: shlb $3, %cl
-; X64-NEXT: andb $-16, %cl
-; X64-NEXT: shrl $13, %eax
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: shrl $9, %eax
+; X64-NEXT: rolb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
@@ -139,12 +125,8 @@ define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind {
; X64-LABEL: rolq_extract_mul_with_mask:
; X64: # %bb.0:
; X64-NEXT: leaq (%rdi,%rdi,8), %rax
-; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
-; X64-NEXT: shll $7, %edi
-; X64-NEXT: leal (%rdi,%rdi,8), %ecx
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: shrq $57, %rax
-; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: rolq $7, %rax
+; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%lhs_mul = mul i64 %i, 1152
%rhs_mul = mul i64 %i, 9
diff --git a/test/CodeGen/X86/signbit-shift.ll b/test/CodeGen/X86/signbit-shift.ll
index cee647931bcb..1579a77a2e9b 100644
--- a/test/CodeGen/X86/signbit-shift.ll
+++ b/test/CodeGen/X86/signbit-shift.ll
@@ -156,9 +156,9 @@ define i32 @sext_ifneg(i32 %x) {
define i32 @add_sext_ifneg(i32 %x) {
; CHECK-LABEL: add_sext_ifneg:
; CHECK: # %bb.0:
-; CHECK-NEXT: shrl $31, %edi
-; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: subl %edi, %eax
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: leal 42(%rdi), %eax
; CHECK-NEXT: retq
%c = icmp slt i32 %x, 0
%e = sext i1 %c to i32
@@ -169,9 +169,9 @@ define i32 @add_sext_ifneg(i32 %x) {
define i32 @sel_ifneg_fval_bigger(i32 %x) {
; CHECK-LABEL: sel_ifneg_fval_bigger:
; CHECK: # %bb.0:
-; CHECK-NEXT: shrl $31, %edi
-; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: subl %edi, %eax
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: leal 42(%rdi), %eax
; CHECK-NEXT: retq
%c = icmp slt i32 %x, 0
%r = select i1 %c, i32 41, i32 42
@@ -231,9 +231,10 @@ define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
define i32 @sub_lshr(i32 %x, i32 %y) {
; CHECK-LABEL: sub_lshr:
; CHECK: # %bb.0:
-; CHECK-NEXT: shrl $31, %edi
-; CHECK-NEXT: subl %edi, %esi
-; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: retq
%sh = lshr i32 %x, 31
%r = sub i32 %y, %sh
@@ -243,9 +244,8 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: sub_lshr_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: psrld $31, %xmm0
-; CHECK-NEXT: psubd %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrad $31, %xmm0
+; CHECK-NEXT: paddd %xmm1, %xmm0
; CHECK-NEXT: retq
%sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%r = sub <4 x i32> %y, %sh
@@ -255,9 +255,9 @@ define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
define i32 @sub_const_op_lshr(i32 %x) {
; CHECK-LABEL: sub_const_op_lshr:
; CHECK: # %bb.0:
-; CHECK-NEXT: shrl $31, %edi
-; CHECK-NEXT: xorl $43, %edi
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: leal 43(%rdi), %eax
; CHECK-NEXT: retq
%sh = lshr i32 %x, 31
%r = sub i32 43, %sh
@@ -267,10 +267,8 @@ define i32 @sub_const_op_lshr(i32 %x) {
define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
; CHECK-LABEL: sub_const_op_lshr_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: psrld $31, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-NEXT: psubd %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrad $31, %xmm0
+; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%r = sub <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %sh
diff --git a/test/CodeGen/X86/speculative-load-hardening.ll b/test/CodeGen/X86/speculative-load-hardening.ll
index 45b9c2f29807..55f7949c0da0 100644
--- a/test/CodeGen/X86/speculative-load-hardening.ll
+++ b/test/CodeGen/X86/speculative-load-hardening.ll
@@ -8,7 +8,7 @@ declare void @leak(i32 %v1, i32 %v2)
declare void @sink(i32)
-define i32 @test_trivial_entry_load(i32* %ptr) nounwind {
+define i32 @test_trivial_entry_load(i32* %ptr) {
; X64-LABEL: test_trivial_entry_load:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rsp, %rcx
@@ -29,12 +29,18 @@ entry:
ret i32 %v
}
-define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) nounwind {
+define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) {
; X64-LABEL: test_basic_conditions:
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %r15
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
+; X64-NEXT: .cfi_offset %r14, -24
+; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq $-1, %rbx
; X64-NEXT: sarq $63, %rax
@@ -50,10 +56,14 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: popq %r15
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
; X64-NEXT: .LBB1_4: # %then2
+; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: movq %r8, %r15
; X64-NEXT: cmovneq %rbx, %rax
; X64-NEXT: testl %edx, %edx
@@ -90,19 +100,21 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr
;
; X64-LFENCE-LABEL: test_basic_conditions:
; X64-LFENCE: # %bb.0: # %entry
+; X64-LFENCE-NEXT: pushq %r14
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16
+; X64-LFENCE-NEXT: pushq %rbx
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24
+; X64-LFENCE-NEXT: pushq %rax
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32
+; X64-LFENCE-NEXT: .cfi_offset %rbx, -24
+; X64-LFENCE-NEXT: .cfi_offset %r14, -16
; X64-LFENCE-NEXT: testl %edi, %edi
; X64-LFENCE-NEXT: jne .LBB1_6
; X64-LFENCE-NEXT: # %bb.1: # %then1
; X64-LFENCE-NEXT: lfence
; X64-LFENCE-NEXT: testl %esi, %esi
-; X64-LFENCE-NEXT: je .LBB1_2
-; X64-LFENCE-NEXT: .LBB1_6: # %exit
-; X64-LFENCE-NEXT: lfence
-; X64-LFENCE-NEXT: retq
-; X64-LFENCE-NEXT: .LBB1_2: # %then2
-; X64-LFENCE-NEXT: pushq %r14
-; X64-LFENCE-NEXT: pushq %rbx
-; X64-LFENCE-NEXT: pushq %rax
+; X64-LFENCE-NEXT: jne .LBB1_6
+; X64-LFENCE-NEXT: # %bb.2: # %then2
; X64-LFENCE-NEXT: movq %r8, %rbx
; X64-LFENCE-NEXT: lfence
; X64-LFENCE-NEXT: testl %edx, %edx
@@ -126,10 +138,14 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr
; X64-LFENCE-NEXT: .LBB1_5: # %merge
; X64-LFENCE-NEXT: movslq (%r14), %rax
; X64-LFENCE-NEXT: movl $0, (%rbx,%rax,4)
+; X64-LFENCE-NEXT: .LBB1_6: # %exit
+; X64-LFENCE-NEXT: lfence
; X64-LFENCE-NEXT: addq $8, %rsp
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24
; X64-LFENCE-NEXT: popq %rbx
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16
; X64-LFENCE-NEXT: popq %r14
-; X64-LFENCE-NEXT: lfence
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 8
; X64-LFENCE-NEXT: retq
entry:
%a.cmp = icmp eq i32 %a, 0
@@ -465,12 +481,18 @@ declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr
declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
-define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; X64-LABEL: test_basic_eh:
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rbp
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
+; X64-NEXT: .cfi_offset %r14, -24
+; X64-NEXT: .cfi_offset %rbp, -16
; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq $-1, %rcx
; X64-NEXT: sarq $63, %rax
@@ -507,10 +529,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: popq %rbp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
; X64-NEXT: .LBB4_4: # %lpad
+; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: .Ltmp2:
; X64-NEXT: movq %rsp, %rcx
; X64-NEXT: sarq $63, %rcx
@@ -529,8 +555,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality
; X64-LFENCE-LABEL: test_basic_eh:
; X64-LFENCE: # %bb.0: # %entry
; X64-LFENCE-NEXT: pushq %rbp
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16
; X64-LFENCE-NEXT: pushq %r14
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24
; X64-LFENCE-NEXT: pushq %rbx
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32
+; X64-LFENCE-NEXT: .cfi_offset %rbx, -32
+; X64-LFENCE-NEXT: .cfi_offset %r14, -24
+; X64-LFENCE-NEXT: .cfi_offset %rbp, -16
; X64-LFENCE-NEXT: cmpl $41, %edi
; X64-LFENCE-NEXT: jg .LBB4_2
; X64-LFENCE-NEXT: # %bb.1: # %thrower
@@ -551,10 +583,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality
; X64-LFENCE-NEXT: .LBB4_2: # %exit
; X64-LFENCE-NEXT: lfence
; X64-LFENCE-NEXT: popq %rbx
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24
; X64-LFENCE-NEXT: popq %r14
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16
; X64-LFENCE-NEXT: popq %rbp
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 8
; X64-LFENCE-NEXT: retq
; X64-LFENCE-NEXT: .LBB4_3: # %lpad
+; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32
; X64-LFENCE-NEXT: .Ltmp2:
; X64-LFENCE-NEXT: movl (%rax), %eax
; X64-LFENCE-NEXT: addl (%rbx), %eax
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 2416a177228e..3f251dd8d62c 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -301,9 +301,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
@@ -313,9 +313,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -331,9 +331,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
@@ -342,9 +342,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -359,9 +359,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -370,9 +370,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = srem <2 x i64> %a, <i64 7, i64 7>
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index c112e84fbf73..5df4d09e9715 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -263,9 +263,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
@@ -274,9 +274,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
@@ -286,9 +286,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
@@ -297,9 +297,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -316,9 +316,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
@@ -327,9 +327,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
@@ -339,9 +339,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
@@ -350,9 +350,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 310e1fc7057a..893c7d1bbd7b 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -214,9 +214,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -225,9 +225,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
@@ -238,9 +238,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -249,9 +249,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -263,9 +263,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -274,9 +274,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
@@ -286,9 +286,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -297,9 +297,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index c991a905c054..598782ddd639 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -278,9 +278,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: shrq $2, %rax
; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rax, %rdx
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: addq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
@@ -291,9 +291,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: shrq $2, %rax
; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rax, %rdx
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: addq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -310,9 +310,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: addq %rdx, %rax
; SSE41-NEXT: shrq $2, %rax
; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rax, %rdx
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: addq %rcx, %rax
+; SSE41-NEXT: movq %rax, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
@@ -322,9 +322,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: addq %rdx, %rax
; SSE41-NEXT: shrq $2, %rax
; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rax, %rdx
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: addq %rcx, %rax
+; SSE41-NEXT: movq %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -340,9 +340,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -352,9 +352,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = urem <2 x i64> %a, <i64 7, i64 7>
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 81d93984e261..377ff5ea77af 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -264,9 +264,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
@@ -276,9 +276,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
@@ -289,9 +289,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
@@ -301,9 +301,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -321,9 +321,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
@@ -333,9 +333,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
@@ -346,9 +346,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
@@ -358,9 +358,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 1288f5a5d5be..22c359cb7e98 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -218,9 +218,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -230,9 +230,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
@@ -244,9 +244,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -256,9 +256,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -271,9 +271,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -283,9 +283,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
@@ -296,9 +296,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -308,9 +308,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 90a0c6f291b2..b50680ff56ee 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1008,36 +1008,16 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v8i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP: # %bb.0:
@@ -1046,11 +1026,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512DQ-LABEL: constant_shift_v8i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
@@ -1064,10 +1041,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index f0f0bb8a8819..3ca714d7f830 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1025,21 +1025,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
@@ -1102,21 +1092,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; X32-AVX1-LABEL: constant_shift_v16i16:
; X32-AVX1: # %bb.0:
-; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v16i16:
diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll
index 8934535d6f52..24f2b2be4308 100644
--- a/test/CodeGen/X86/win_coreclr_chkstk.ll
+++ b/test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -10,8 +10,6 @@ entry:
; WIN_X64-LABEL:main4k:
; WIN_X64: # %bb.0:
; WIN_X64: movl $4096, %eax
-; WIN_X64: movq %rcx, 8(%rsp)
-; WIN_X64: movq %rdx, 16(%rsp)
; WIN_X64: xorq %rcx, %rcx
; WIN_X64: movq %rsp, %rdx
; WIN_X64: subq %rax, %rdx
@@ -27,8 +25,6 @@ entry:
; WIN_X64: cmpq %rcx, %rdx
; WIN_X64: jne .LBB0_2
; WIN_X64:.LBB0_3:
-; WIN_X64: movq 8(%rsp), %rcx
-; WIN_X64: movq 16(%rsp), %rdx
; WIN_X64: subq %rax, %rsp
; WIN_X64: xorl %eax, %eax
; WIN_X64: addq $4096, %rsp
@@ -45,7 +41,6 @@ entry:
define i32 @main4k_frame() nounwind "no-frame-pointer-elim"="true" {
entry:
; WIN_X64-LABEL:main4k_frame:
-; WIN_X64: movq %rcx, 16(%rsp)
; WIN_X64: movq %gs:16, %rcx
; LINUX-LABEL:main4k_frame:
; LINUX-NOT: movq %gs:16, %rcx
@@ -58,7 +53,6 @@ entry:
; Case with INT args
define i32 @main4k_intargs(i32 %x, i32 %y) nounwind {
entry:
-; WIN_X64: movq %rcx, 8(%rsp)
; WIN_X64: movq %gs:16, %rcx
; LINUX-NOT: movq %gs:16, %rcx
; LINUX: retq
@@ -71,7 +65,6 @@ entry:
; Case with FP regs
define i32 @main4k_fpargs(double %x, double %y) nounwind {
entry:
-; WIN_X64: movq %rcx, 8(%rsp)
; WIN_X64: movq %gs:16, %rcx
; LINUX-NOT: movq %gs:16, %rcx
; LINUX: retq
diff --git a/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir b/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir
new file mode 100644
index 000000000000..8da5f895063f
--- /dev/null
+++ b/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir
@@ -0,0 +1,24 @@
+# RUN: llc -verify-machineinstrs %s -run-pass prologepilog -mtriple=x86_64-pc-win32-coreclr -o - | FileCheck %s
+...
+---
+name: main4k
+# CHECK-LABEL: name: main4k
+
+alignment: 4
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 8
+stack:
+ - { id: 0, size: 4096, alignment: 1, stack-id: 0 }
+body: |
+ bb.0.entry:
+ $eax = IMPLICIT_DEF
+ RET 0, killed $eax
+
+ ; CHECK: bb.1.entry:
+ ; CHECK: liveins: $rdx
+ ; CHECK: bb.2.entry:
+ ; CHECK: liveins: $rcx, $rdx
+ ; CHECK: bb.3.entry:
+ ; CHECK: liveins: $rax
+...