diff options
Diffstat (limited to 'test/CodeGen/X86')
30 files changed, 1704 insertions, 884 deletions
diff --git a/test/CodeGen/X86/atom-fixup-lea2.ll b/test/CodeGen/X86/atom-fixup-lea2.ll index 9b0b472be0f3..b8a0369a45f4 100644 --- a/test/CodeGen/X86/atom-fixup-lea2.ll +++ b/test/CodeGen/X86/atom-fixup-lea2.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s -; RUN: llc < %s -mcpu=goldmont -mtriple=i686-linux | FileCheck %s ; CHECK:%bb.5 ; CHECK-NEXT:leal diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll index cc99d71009c6..7f0573c6175c 100644 --- a/test/CodeGen/X86/combine-sdiv.ll +++ b/test/CodeGen/X86/combine-sdiv.ll @@ -285,43 +285,23 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrlw $4, %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [49408,32992,24736,57408,49408,32992,24736,57408] -; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrlw $2, %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrlw $1, %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psraw $4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psraw $2, %xmm4 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psraw $1, %xmm4 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2] +; SSE-NEXT: pmullw %xmm2, %xmm3 ; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psraw $4, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16384,32800,41056,8384,16384,32800,41056,8384] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psraw $2, %xmm4 @@ -332,9 +312,23 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; SSE-NEXT: paddw %xmm0, %xmm0 ; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm2 ; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psraw $4, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psraw $2, %xmm4 +; SSE-NEXT: paddw %xmm0, %xmm0 +; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psraw $1, %xmm4 +; SSE-NEXT: paddw %xmm0, %xmm0 +; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -342,18 +336,15 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [49408,32992,24736,57408,49408,32992,24736,57408] -; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2] +; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3 @@ -387,18 +378,11 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [49408,32992,24736,57408,49408,32992,24736,57408] -; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-NEXT: vpsraw $4, %xmm2, %xmm3 @@ -426,6 +410,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8: @@ -481,18 +466,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5,6],xmm1[7] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlw $2, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2],xmm1[3,4],xmm2[5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psraw $4, %xmm2 @@ -510,14 +484,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4],xmm1[5,6],xmm2[7] -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7] @@ -531,10 +498,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1 @@ -547,9 +511,7 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1 @@ -583,70 +545,44 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; SSE-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psraw $15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2] +; SSE-NEXT: pmulhuw %xmm3, %xmm2 +; SSE-NEXT: paddw %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psraw $4, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4],xmm4[5,6],xmm2[7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psraw $2, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psraw $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7] +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psraw $15, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4],xmm3[5,6],xmm0[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlw $2, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: psrlw $1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] -; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pmulhuw %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psraw $4, %xmm3 ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3],xmm0[4],xmm3[5,6],xmm0[7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psraw $2, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: psraw $1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psraw $15, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrlw $4, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5,6],xmm2[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrlw $2, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrlw $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; SSE-NEXT: paddw %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psraw $4, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psraw $2, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psraw $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psraw $2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psraw $1, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6],xmm3[7] -; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7] @@ -655,14 +591,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6],xmm3[7] -; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7] @@ -680,26 +609,17 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm4 +; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm4, %ymm4 +; AVX2-NEXT: vpaddw %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] -; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpsravd %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] -; AVX2-NEXT: vpsrlvd %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX2-NEXT: vpsravd %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX2-NEXT: vpsravd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX2-NEXT: vpsravd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 ; AVX2-NEXT: vpackusdw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] @@ -708,9 +628,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1 @@ -753,93 +671,50 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrlw $8, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5,6],xmm0[7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psrlw $2, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0],xmm5[1,2],xmm0[3,4],xmm5[5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrlw $1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2] +; SSE-NEXT: pmulhuw %xmm5, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psraw $4, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psraw $2, %xmm6 -; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psraw $4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3],xmm0[4],xmm6[5,6],xmm0[7] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: psraw $2, %xmm7 +; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: psraw $1, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2],xmm6[3],xmm0[4,5],xmm6[6],xmm0[7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7] ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrlw $8, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4],xmm5[5,6],xmm1[7] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrlw $2, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3,4],xmm5[5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] +; SSE-NEXT: pmulhuw %xmm5, %xmm1 ; SSE-NEXT: paddw %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psraw $4, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3],xmm1[4],xmm5[5,6],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psraw $2, %xmm6 -; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psraw $4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3],xmm1[4],xmm6[5,6],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: psraw $2, %xmm7 +; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psraw $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psraw $15, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psrlw $8, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: psrlw $4, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5,6],xmm4[7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psrlw $2, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: psrlw $1, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; SSE-NEXT: pmulhuw %xmm5, %xmm4 ; SSE-NEXT: paddw %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psraw $4, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psraw $2, %xmm6 -; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: psraw $4, %xmm6 +; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4],xmm6[5,6],xmm4[7] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: psraw $2, %xmm7 +; SSE-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: psraw $1, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] ; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: psraw $15, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: psrlw $8, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlw $4, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4],xmm5[5,6],xmm2[7] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: psrlw $2, %xmm5 -; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1,2],xmm2[3,4],xmm5[5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlw $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7] +; SSE-NEXT: pmulhuw %xmm5, %xmm2 ; SSE-NEXT: paddw %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: psraw $4, %xmm5 @@ -857,54 +732,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4],xmm3[5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsraw $4, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX1-NEXT: vpsraw $15, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4],xmm3[5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsraw $15, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4],xmm4[5,6],xmm5[7] -; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,2,16,8,32,64,2] +; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4],xmm4[5,6],xmm3[7] @@ -912,16 +743,9 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX1-NEXT: vpsraw $15, %xmm1, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3,4],xmm4[5,6],xmm5[7] -; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] -; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 @@ -929,51 +753,62 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpsraw $15, %xmm3, %xmm5 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpaddw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] +; AVX1-NEXT: vpsraw $15, %xmm1, %xmm5 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $4, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX1-NEXT: vpsraw $2, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX1-NEXT: vpsraw $1, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] -; AVX2-NEXT: vpsrlvd %ymm4, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] -; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 -; AVX2-NEXT: vpackusdw %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpaddw %ymm5, %ymm0, %ymm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] -; AVX2-NEXT: vpsravd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX2-NEXT: vpsravd %ymm4, %ymm7, %ymm7 +; AVX2-NEXT: vpsrld $16, %ymm7, %ymm7 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] -; AVX2-NEXT: vpsravd %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vpsravd %ymm3, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 -; AVX2-NEXT: vpackusdw %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpackusdw %ymm7, %ymm5, %ymm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm5 +; AVX2-NEXT: vpmulhuw %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpaddw %ymm5, %ymm1, %ymm5 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] -; AVX2-NEXT: vpsrlvd %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] -; AVX2-NEXT: vpsrlvd %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 -; AVX2-NEXT: vpackusdw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX2-NEXT: vpsravd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpsravd %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX2-NEXT: vpsravd %ymm7, %ymm2, %ymm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX2-NEXT: vpsravd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 ; AVX2-NEXT: vpackusdw %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] @@ -982,11 +817,9 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,14,15,12,13,11,10,15,16,14,15,12,13,11,10,15] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] @@ -995,9 +828,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2 @@ -2021,43 +1852,22 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pcmpgtb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlw $4, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [256,224,256,224,57600,57568,8416,8416] -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlw $2, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: paddb %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlw $1, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: paddb %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE-NEXT: paddb %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psraw $4, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psraw $2, %xmm5 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psraw $1, %xmm5 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm4 ; SSE-NEXT: psrlw $8, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: paddb %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: psraw $4, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,32,0,32,8192,8224,57376,57376] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: psraw $2, %xmm5 @@ -2068,9 +1878,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE-NEXT: paddw %xmm0, %xmm0 ; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3 ; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psraw $4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psraw $2, %xmm5 +; SSE-NEXT: paddw %xmm0, %xmm0 +; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psraw $1, %xmm5 +; SSE-NEXT: paddw %xmm0, %xmm0 +; SSE-NEXT: pblendvb %xmm0, %xmm5, %xmm4 +; SSE-NEXT: psrlw $8, %xmm4 +; SSE-NEXT: packuswb %xmm3, %xmm4 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1 ; SSE-NEXT: psubb %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] ; SSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 @@ -2081,18 +1905,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,224,256,224,57600,57568,8416,8416] -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 @@ -2129,18 +1949,11 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [256,224,256,224,57600,57568,8416,8416] -; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $2, %xmm2, %xmm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 @@ -2171,6 +1984,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: non_splat_minus_one_divisor_1: diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index c037b0f0aa4b..1fc557f008fe 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -337,30 +337,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) { define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_shl_zext_lshr1: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -385,18 +362,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) { ; ; SSE41-LABEL: combine_vec_shl_zext_lshr1: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3,4],xmm1[5,6],xmm0[7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -406,13 +372,9 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) { ; ; AVX-LABEL: combine_vec_shl_zext_lshr1: ; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,8] -; AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8> %2 = zext <8 x i16> %1 to <8 x i32> diff --git a/test/CodeGen/X86/dagcombine-select.ll b/test/CodeGen/X86/dagcombine-select.ll index 2b05154676e6..1ef6cfdfd40a 100644 --- a/test/CodeGen/X86/dagcombine-select.ll +++ b/test/CodeGen/X86/dagcombine-select.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -; RUN: llc -mtriple=x86_64-unknown-unknown -verify-machineinstrs -mattr=+bmi < %s | FileCheck -check-prefix=BMI -enable-var-scope %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NOBMI -enable-var-scope +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -mattr=+bmi | FileCheck %s -check-prefixes=CHECK,BMI -enable-var-scope define i32 @select_and1(i32 %x, i32 %y) { ; CHECK-LABEL: select_and1: @@ -283,14 +283,14 @@ define double @frem_constant_sel_constants(i1 %cond) { declare i64 @llvm.cttz.i64(i64, i1) define i64 @cttz_64_eq_select(i64 %v) nounwind { -; CHECK-LABEL: cttz_64_eq_select: -; CHECK: # %bb.0: -; CHECK-NEXT: bsfq %rdi, %rcx -; CHECK-NEXT: movq $-1, %rax -; CHECK-NEXT: cmovneq %rcx, %rax -; CHECK-NEXT: addq $6, %rax -; CHECK-NEXT: retq - +; NOBMI-LABEL: cttz_64_eq_select: +; NOBMI: # %bb.0: +; NOBMI-NEXT: bsfq %rdi, %rcx +; NOBMI-NEXT: movq $-1, %rax +; NOBMI-NEXT: cmovneq %rcx, %rax +; NOBMI-NEXT: addq $6, %rax +; NOBMI-NEXT: retq +; ; BMI-LABEL: cttz_64_eq_select: ; BMI: # %bb.0: ; BMI-NEXT: tzcntq %rdi, %rcx @@ -298,6 +298,7 @@ define i64 @cttz_64_eq_select(i64 %v) nounwind { ; BMI-NEXT: cmovaeq %rcx, %rax ; BMI-NEXT: addq $6, %rax ; BMI-NEXT: retq + %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true) %tobool = icmp eq i64 %v, 0 %.op = add nuw nsw i64 %cnt, 6 @@ -306,14 +307,14 @@ define i64 @cttz_64_eq_select(i64 %v) nounwind { } define i64 @cttz_64_ne_select(i64 %v) nounwind { -; CHECK-LABEL: cttz_64_ne_select: -; CHECK: # %bb.0: -; CHECK-NEXT: bsfq %rdi, %rcx -; CHECK-NEXT: movq $-1, %rax -; CHECK-NEXT: cmovneq %rcx, %rax -; CHECK-NEXT: addq $6, %rax -; CHECK-NEXT: retq - +; NOBMI-LABEL: cttz_64_ne_select: +; NOBMI: # %bb.0: +; NOBMI-NEXT: bsfq %rdi, %rcx +; NOBMI-NEXT: movq $-1, %rax +; NOBMI-NEXT: cmovneq %rcx, %rax +; NOBMI-NEXT: addq $6, %rax +; NOBMI-NEXT: retq +; ; BMI-LABEL: cttz_64_ne_select: ; BMI: # %bb.0: ; BMI-NEXT: tzcntq %rdi, %rcx @@ -321,6 +322,7 @@ define i64 @cttz_64_ne_select(i64 %v) nounwind { ; BMI-NEXT: cmovaeq %rcx, %rax ; BMI-NEXT: addq $6, %rax ; BMI-NEXT: retq + %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true) %tobool = icmp ne i64 %v, 0 %.op = add nuw nsw i64 %cnt, 6 @@ -330,14 +332,14 @@ define i64 @cttz_64_ne_select(i64 %v) nounwind { declare i32 @llvm.cttz.i32(i32, i1) define i32 @cttz_32_eq_select(i32 %v) nounwind { -; CHECK-LABEL: cttz_32_eq_select: -; CHECK: # %bb.0: -; CHECK-NEXT: bsfl %edi, %ecx -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovnel %ecx, %eax -; CHECK-NEXT: addl $6, %eax -; CHECK-NEXT: retq - +; NOBMI-LABEL: cttz_32_eq_select: +; NOBMI: # %bb.0: +; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: movl $-1, %eax +; NOBMI-NEXT: cmovnel %ecx, %eax +; NOBMI-NEXT: addl $6, %eax +; NOBMI-NEXT: retq +; ; BMI-LABEL: cttz_32_eq_select: ; BMI: # %bb.0: ; BMI-NEXT: tzcntl %edi, %ecx @@ -345,6 +347,7 @@ define i32 @cttz_32_eq_select(i32 %v) nounwind { ; BMI-NEXT: cmovael %ecx, %eax ; BMI-NEXT: addl $6, %eax ; BMI-NEXT: retq + %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true) %tobool = icmp eq i32 %v, 0 %.op = add nuw nsw i32 %cnt, 6 @@ -353,14 +356,14 @@ define i32 @cttz_32_eq_select(i32 %v) nounwind { } define i32 @cttz_32_ne_select(i32 %v) nounwind { -; CHECK-LABEL: cttz_32_ne_select: -; CHECK: # %bb.0: -; CHECK-NEXT: bsfl %edi, %ecx -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovnel %ecx, %eax -; CHECK-NEXT: addl $6, %eax -; CHECK-NEXT: retq - +; NOBMI-LABEL: cttz_32_ne_select: +; NOBMI: # %bb.0: +; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: movl $-1, %eax +; NOBMI-NEXT: cmovnel %ecx, %eax +; NOBMI-NEXT: addl $6, %eax +; NOBMI-NEXT: retq +; ; BMI-LABEL: cttz_32_ne_select: ; BMI: # %bb.0: ; BMI-NEXT: tzcntl %edi, %ecx @@ -368,6 +371,7 @@ define i32 @cttz_32_ne_select(i32 %v) nounwind { ; BMI-NEXT: cmovael %ecx, %eax ; BMI-NEXT: addl $6, %eax ; BMI-NEXT: retq + %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true) %tobool = icmp ne i32 %v, 0 %.op = add nuw nsw i32 %cnt, 6 diff --git a/test/CodeGen/X86/fast-isel-fold-mem.ll b/test/CodeGen/X86/fast-isel-fold-mem.ll index 5686484ef935..1c5171926c4b 100644 --- a/test/CodeGen/X86/fast-isel-fold-mem.ll +++ b/test/CodeGen/X86/fast-isel-fold-mem.ll @@ -1,10 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s ; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin | FileCheck %s define i64 @fold_load(i64* %a, i64 %b) { -; CHECK-LABEL: fold_load -; CHECK: addq (%rdi), %rsi -; CHECK-NEXT: movq %rsi, %rax +; CHECK-LABEL: fold_load: +; CHECK: ## %bb.0: +; CHECK-NEXT: addq (%rdi), %rsi +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq %1 = load i64, i64* %a, align 8 %2 = add i64 %1, %b ret i64 %2 diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll index 7b3c99f13cca..cf459f85b33e 100644 --- a/test/CodeGen/X86/fast-isel-select.ll +++ b/test/CodeGen/X86/fast-isel-select.ll @@ -1,14 +1,23 @@ -; RUN: llc -mtriple x86_64-apple-darwin -O0 -o - < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -O0 | FileCheck %s + ; Make sure we only use the less significant bit of the value that feeds the ; select. Otherwise, we may account for a non-zero value whereas the ; lsb is zero. ; <rdar://problem/15651765> -; CHECK-LABEL: fastisel_select: -; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]] -; CHECK: testb $1, [[RES]] -; CHECK: cmovnel %edi, %esi define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) { +; CHECK-LABEL: fastisel_select: +; CHECK: ## %bb.0: +; CHECK-NEXT: movb %sil, %al +; CHECK-NEXT: movb %dil, %cl +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: subb %al, %cl +; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: movl $1204476887, %edi ## imm = 0x47CADBD7 +; CHECK-NEXT: cmovnel %edi, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: retq %shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766 %counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0 ret i32 %counter_diff1345 diff --git a/test/CodeGen/X86/fast-isel-sext-zext.ll b/test/CodeGen/X86/fast-isel-sext-zext.ll index 5e54c98b0d14..82ed6c72ebca 100644 --- a/test/CodeGen/X86/fast-isel-sext-zext.ll +++ b/test/CodeGen/X86/fast-isel-sext-zext.ll @@ -9,7 +9,6 @@ define i8 @test1(i8 %x) nounwind { ; X32-NEXT: andb $1, %al ; X32-NEXT: negb %al ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test1: ; X64: ## %bb.0: @@ -17,7 +16,6 @@ define i8 @test1(i8 %x) nounwind { ; X64-NEXT: negb %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i8 %x to i1 %u = sext i1 %z to i8 ret i8 %u @@ -32,7 +30,6 @@ define i16 @test2(i16 %x) nounwind { ; X32-NEXT: movsbl %al, %eax ; X32-NEXT: ## kill: def $ax killed $ax killed $eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test2: ; X64: ## %bb.0: @@ -41,7 +38,6 @@ define i16 @test2(i16 %x) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i16 %x to i1 %u = sext i1 %z to i16 ret i16 %u @@ -55,7 +51,6 @@ define i32 @test3(i32 %x) nounwind { ; X32-NEXT: negb %al ; X32-NEXT: movsbl %al, %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test3: ; X64: ## %bb.0: @@ -63,7 +58,6 @@ define i32 @test3(i32 %x) nounwind { ; X64-NEXT: negb %dil ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i32 %x to i1 %u = sext i1 %z to i32 ret i32 %u @@ -77,7 +71,6 @@ define i32 @test4(i32 %x) nounwind { ; X32-NEXT: negb %al ; X32-NEXT: movsbl %al, %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test4: ; X64: ## %bb.0: @@ -85,7 +78,6 @@ define i32 @test4(i32 %x) nounwind { ; X64-NEXT: negb %dil ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i32 %x to i1 %u = sext i1 %z to i32 ret i32 %u @@ -97,14 +89,12 @@ define i8 @test5(i8 %x) nounwind { ; X32-NEXT: movb {{[0-9]+}}(%esp), %al ; X32-NEXT: andb $1, %al ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test5: ; X64: ## %bb.0: ; X64-NEXT: andb $1, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i8 %x to i1 %u = zext i1 %z to i8 ret i8 %u @@ -118,7 +108,6 @@ define i16 @test6(i16 %x) nounwind { ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: ## kill: def $ax killed $ax killed $eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test6: ; X64: ## %bb.0: @@ -126,7 +115,6 @@ define i16 @test6(i16 %x) nounwind { ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i16 %x to i1 %u = zext i1 %z to i16 ret i16 %u @@ -139,14 +127,12 @@ define i32 @test7(i32 %x) nounwind { ; X32-NEXT: andb $1, %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test7: ; X64: ## %bb.0: ; X64-NEXT: andb $1, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i32 %x to i1 %u = zext i1 %z to i32 ret i32 %u @@ -159,14 +145,12 @@ define i32 @test8(i32 %x) nounwind { ; X32-NEXT: andb $1, %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test8: ; X64: ## %bb.0: ; X64-NEXT: andb $1, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %z = trunc i32 %x to i1 %u = zext i1 %z to i32 ret i32 %u @@ -178,14 +162,12 @@ define i16 @test9(i8 %x) nounwind { ; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: ## kill: def $ax killed $ax killed $eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test9: ; X64: ## %bb.0: ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i8 %x to i16 ret i16 %u } @@ -195,13 +177,11 @@ define i32 @test10(i8 %x) nounwind { ; X32: ## %bb.0: ; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test10: ; X64: ## %bb.0: ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i8 %x to i32 ret i32 %u } @@ -213,13 +193,11 @@ define i64 @test11(i8 %x) nounwind { ; X32-NEXT: movl %eax, %edx ; X32-NEXT: sarl $31, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test11: ; X64: ## %bb.0: ; X64-NEXT: movsbq %dil, %rax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i8 %x to i64 ret i64 %u } @@ -230,14 +208,12 @@ define i16 @test12(i8 %x) nounwind { ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: ## kill: def $ax killed $ax killed $eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test12: ; X64: ## %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i8 %x to i16 ret i16 %u } @@ -247,13 +223,11 @@ define i32 @test13(i8 %x) nounwind { ; X32: ## %bb.0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test13: ; X64: ## %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i8 %x to i32 ret i32 %u } @@ -264,13 +238,11 @@ define i64 @test14(i8 %x) nounwind { ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test14: ; X64: ## %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i8 %x to i64 ret i64 %u } @@ -280,13 +252,11 @@ define i32 @test15(i16 %x) nounwind { ; X32: ## %bb.0: ; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test15: ; X64: ## %bb.0: ; X64-NEXT: movswl %di, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i16 %x to i32 ret i32 %u } @@ -298,13 +268,11 @@ define i64 @test16(i16 %x) nounwind { ; X32-NEXT: movl %eax, %edx ; X32-NEXT: sarl $31, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test16: ; X64: ## %bb.0: ; X64-NEXT: movswq %di, %rax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i16 %x to i64 ret i64 %u } @@ -314,13 +282,11 @@ define i32 @test17(i16 %x) nounwind { ; X32: ## %bb.0: ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test17: ; X64: ## %bb.0: ; X64-NEXT: movzwl %di, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i16 %x to i32 ret i32 %u } @@ -331,13 +297,11 @@ define i64 @test18(i16 %x) nounwind { ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test18: ; X64: ## %bb.0: ; X64-NEXT: movzwl %di, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i16 %x to i64 ret i64 %u } @@ -349,13 +313,11 @@ define i64 @test19(i32 %x) nounwind { ; X32-NEXT: movl %eax, %edx ; X32-NEXT: sarl $31, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test19: ; X64: ## %bb.0: ; X64-NEXT: movslq %edi, %rax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = sext i32 %x to i64 ret i64 %u } @@ -366,13 +328,11 @@ define i64 @test20(i32 %x) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl -; X32-NEXT: ## -- End function ; ; X64-LABEL: test20: ; X64: ## %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: retq -; X64-NEXT: ## -- End function %u = zext i32 %x to i64 ret i64 %u } diff --git a/test/CodeGen/X86/flags-copy-lowering.mir b/test/CodeGen/X86/flags-copy-lowering.mir index 54ce02aaca58..d5991754d40b 100644 --- a/test/CodeGen/X86/flags-copy-lowering.mir +++ b/test/CodeGen/X86/flags-copy-lowering.mir @@ -90,6 +90,18 @@ call void @foo() ret i64 0 } + + define i32 @test_existing_setcc(i64 %a, i64 %b) { + entry: + call void @foo() + ret i32 0 + } + + define i32 @test_existing_setcc_memory(i64 %a, i64 %b) { + entry: + call void @foo() + ret i32 0 + } ... --- name: test_branch @@ -936,3 +948,110 @@ body: | ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags ... +--- +name: test_existing_setcc +# CHECK-LABEL: name: test_existing_setcc +liveins: + - { reg: '$rdi', virtual-reg: '%0' } + - { reg: '$rsi', virtual-reg: '%1' } +body: | + bb.0: + successors: %bb.1, %bb.2, %bb.3 + liveins: $rdi, $rsi + + %0:gr64 = COPY $rdi + %1:gr64 = COPY $rsi + CMP64rr %0, %1, implicit-def $eflags + %2:gr8 = SETAr implicit $eflags + %3:gr8 = SETAEr implicit $eflags + %4:gr64 = COPY $eflags + ; CHECK: CMP64rr %0, %1, implicit-def $eflags + ; CHECK-NEXT: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags + ; CHECK-NEXT: %[[AE_REG:[^:]*]]:gr8 = SETAEr implicit $eflags + ; CHECK-NOT: COPY{{( killed)?}} $eflags + + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + + $eflags = COPY %4 + JA_1 %bb.1, implicit $eflags + JB_1 %bb.2, implicit $eflags + JMP_1 %bb.3 + ; CHECK-NOT: $eflags = + ; + ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags + ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags + ; CHECK-SAME: {{$[[:space:]]}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: {{.*$}} + ; CHECK-SAME: {{$[[:space:]]}} + ; CHECK-NEXT: TEST8rr %[[AE_REG]], %[[AE_REG]], implicit-def $eflags + ; CHECK-NEXT: JE_1 %bb.2, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.3 + + bb.1: + %5:gr32 = MOV32ri64 42 + $eax = COPY %5 + RET 0, $eax + + bb.2: + %6:gr32 = MOV32ri64 43 + $eax = COPY %6 + RET 0, $eax + + bb.3: + %7:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY %7 + RET 0, $eax + +... +--- +name: test_existing_setcc_memory +# CHECK-LABEL: name: test_existing_setcc_memory +liveins: + - { reg: '$rdi', virtual-reg: '%0' } + - { reg: '$rsi', virtual-reg: '%1' } +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $rdi, $rsi + + %0:gr64 = COPY $rdi + %1:gr64 = COPY $rsi + CMP64rr %0, %1, implicit-def $eflags + SETEm %0, 1, $noreg, -16, $noreg, implicit $eflags + %2:gr64 = COPY $eflags + ; CHECK: CMP64rr %0, %1, implicit-def $eflags + ; We cannot reuse this SETE because it stores the flag directly to memory, + ; so we have two SETEs here. FIXME: It'd be great if something could fold + ; these automatically. If not, maybe we want to unfold SETcc instructions + ; writing to memory so we can reuse them. + ; CHECK-NEXT: SETEm {{.*}} implicit $eflags + ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags + ; CHECK-NOT: COPY{{( killed)?}} $eflags + + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + + $eflags = COPY %2 + JE_1 %bb.1, implicit $eflags + JMP_1 %bb.2 + ; CHECK-NOT: $eflags = + ; + ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags + ; CHECK-NEXT: JNE_1 %bb.1, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + + bb.1: + %3:gr32 = MOV32ri64 42 + $eax = COPY %3 + RET 0, $eax + + bb.2: + %4:gr32 = MOV32ri64 43 + $eax = COPY %4 + RET 0, $eax + +... diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll index b285a4ed5224..6899babf31de 100644 --- a/test/CodeGen/X86/lea-opt.ll +++ b/test/CodeGen/X86/lea-opt.ll @@ -307,3 +307,154 @@ sw.bb.2: ; preds = %entry sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry ret void } + +define i32 @test5(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl %esi, %esi +; CHECK-NEXT: subl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 %y, -2 + %add = add nsw i32 %mul, %x + ret i32 %add +} + +define i32 @test6(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (%rsi,%rsi,2), %eax +; CHECK-NEXT: subl %eax, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 %y, -3 + %add = add nsw i32 %mul, %x + ret i32 %add +} + +define i32 @test7(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $2, %esi +; CHECK-NEXT: subl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 %y, -4 + %add = add nsw i32 %mul, %x + ret i32 %add +} + +define i32 @test8(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,4), %eax +; CHECK-NEXT: subl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = shl nsw i32 %y, 2 + %sub = sub nsw i32 %mul, %x + ret i32 %sub +} + + +define i32 @test9(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl %esi, %esi +; CHECK-NEXT: subl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 -2, %y + %add = add nsw i32 %x, %mul + ret i32 %add +} + +define i32 @test10(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (%rsi,%rsi,2), %eax +; CHECK-NEXT: subl %eax, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 -3, %y + %add = add nsw i32 %x, %mul + ret i32 %add +} + +define i32 @test11(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $2, %esi +; CHECK-NEXT: subl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 -4, %y + %add = add nsw i32 %x, %mul + ret i32 %add +} + +define i32 @test12(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,4), %eax +; CHECK-NEXT: subl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 4, %y + %sub = sub nsw i32 %mul, %x + ret i32 %sub +} + +define i64 @test13(i64 %x, i64 %y) #0 { +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlq $2, %rsi +; CHECK-NEXT: subq %rsi, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i64 -4, %y + %add = add nsw i64 %x, %mul + ret i64 %add +} + +define i32 @test14(i32 %x, i32 %y) #0 { +; CHECK-LABEL: test14: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,4), %eax +; CHECK-NEXT: subl %edi, %eax +; CHECK-NEXT: retq +entry: + %mul = mul nsw i32 4, %y + %sub = sub nsw i32 %mul, %x + ret i32 %sub +} + +define zeroext i16 @test15(i16 zeroext %x, i16 zeroext %y) #0 { +; CHECK-LABEL: test15: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $3, %esi +; CHECK-NEXT: subl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq +entry: + %conv = zext i16 %x to i32 + %conv1 = zext i16 %y to i32 + %mul = mul nsw i32 -8, %conv1 + %add = add nsw i32 %conv, %mul + %conv2 = trunc i32 %add to i16 + ret i16 %conv2 +} + +attributes #0 = { norecurse nounwind optsize readnone uwtable} diff --git a/test/CodeGen/X86/machine-outliner-tailcalls.ll b/test/CodeGen/X86/machine-outliner-tailcalls.ll index 6f28354c386b..71ebade623cf 100644 --- a/test/CodeGen/X86/machine-outliner-tailcalls.ll +++ b/test/CodeGen/X86/machine-outliner-tailcalls.ll @@ -1,4 +1,4 @@ -; RUN: llc -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s @x = common local_unnamed_addr global i32 0, align 4 diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll index 737bcc7c864d..cf367ecbb98e 100644 --- a/test/CodeGen/X86/mul-constant-i16.ll +++ b/test/CodeGen/X86/mul-constant-i16.ll @@ -766,6 +766,50 @@ define i16 @test_mul_by_520(i16 %x) { ret i16 %mul } +define i16 @test_mul_by_neg10(i16 %x) { +; X86-LABEL: test_mul_by_neg10: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %eax, %eax +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: negl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: test_mul_by_neg10: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: negl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %mul = mul nsw i16 %x, -10 + ret i16 %mul +} + +define i16 @test_mul_by_neg36(i16 %x) { +; X86-LABEL: test_mul_by_neg36: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $2, %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: negl %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: test_mul_by_neg36: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: negl %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %mul = mul nsw i16 %x, -36 + ret i16 %mul +} + ; (x*9+42)*(x*5+2) define i16 @test_mul_spec(i16 %x) nounwind { ; X86-LABEL: test_mul_spec: diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll index 356d5a00abf6..04f867bb4e16 100644 --- a/test/CodeGen/X86/mul-constant-i32.ll +++ b/test/CodeGen/X86/mul-constant-i32.ll @@ -1997,6 +1997,118 @@ define i32 @test_mul_by_520(i32 %x) { ret i32 %mul } +define i32 @test_mul_by_neg10(i32 %x) { +; X86-LABEL: test_mul_by_neg10: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %eax, %eax +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: negl %eax +; X86-NEXT: retl +; +; X64-HSW-LABEL: test_mul_by_neg10: +; X64-HSW: # %bb.0: +; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi +; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: negl %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [7:1.00] +; +; X64-JAG-LABEL: test_mul_by_neg10: +; X64-JAG: # %bb.0: +; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi +; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00] +; X64-JAG-NEXT: negl %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_neg10: +; X86-NOOPT: # %bb.0: +; X86-NOOPT-NEXT: imull $-10, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_neg10: +; HSW-NOOPT: # %bb.0: +; HSW-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [7:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_neg10: +; JAG-NOOPT: # %bb.0: +; JAG-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_neg10: +; X64-SLM: # %bb.0: +; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: negl %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_neg10: +; SLM-NOOPT: # %bb.0: +; SLM-NOOPT-NEXT: imull $-10, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] + %mul = mul nsw i32 %x, -10 + ret i32 %mul +} + +define i32 @test_mul_by_neg36(i32 %x) { +; X86-LABEL: test_mul_by_neg36: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $2, %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: negl %eax +; X86-NEXT: retl +; +; X64-HSW-LABEL: test_mul_by_neg36: +; X64-HSW: # %bb.0: +; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi +; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: negl %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [7:1.00] +; +; X64-JAG-LABEL: test_mul_by_neg36: +; X64-JAG: # %bb.0: +; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi +; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00] +; X64-JAG-NEXT: negl %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_neg36: +; X86-NOOPT: # %bb.0: +; X86-NOOPT-NEXT: imull $-36, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_neg36: +; HSW-NOOPT: # %bb.0: +; HSW-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [7:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_neg36: +; JAG-NOOPT: # %bb.0: +; JAG-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_neg36: +; X64-SLM: # %bb.0: +; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: negl %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_neg36: +; SLM-NOOPT: # %bb.0: +; SLM-NOOPT-NEXT: imull $-36, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] + %mul = mul nsw i32 %x, -36 + ret i32 %mul +} + ; (x*9+42)*(x*5+2) define i32 @test_mul_spec(i32 %x) nounwind { ; X86-LABEL: test_mul_spec: diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll index 332ad7f01299..761ca67ab31b 100644 --- a/test/CodeGen/X86/mul-constant-i64.ll +++ b/test/CodeGen/X86/mul-constant-i64.ll @@ -2107,6 +2107,144 @@ define i64 @test_mul_by_520(i64 %x) { ret i64 %mul } +define i64 @test_mul_by_neg10(i64 %x) { +; X86-LABEL: test_mul_by_neg10: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $-10, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: leal (%esi,%esi,4), %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-HSW-LABEL: test_mul_by_neg10: +; X64-HSW: # %bb.0: +; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: negq %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [7:1.00] +; +; X64-JAG-LABEL: test_mul_by_neg10: +; X64-JAG: # %bb.0: +; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [2:1.00] +; X64-JAG-NEXT: negq %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_neg10: +; X86-NOOPT: # %bb.0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: movl $-10, %edx +; X86-NOOPT-NEXT: movl %ecx, %eax +; X86-NOOPT-NEXT: mull %edx +; X86-NOOPT-NEXT: subl %ecx, %edx +; X86-NOOPT-NEXT: imull $-10, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_neg10: +; HSW-NOOPT: # %bb.0: +; HSW-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [7:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_neg10: +; JAG-NOOPT: # %bb.0: +; JAG-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [6:4.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_neg10: +; X64-SLM: # %bb.0: +; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: negq %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_neg10: +; SLM-NOOPT: # %bb.0: +; SLM-NOOPT-NEXT: imulq $-10, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] + %mul = mul nsw i64 %x, -10 + ret i64 %mul +} + +define i64 @test_mul_by_neg36(i64 %x) { +; X86-LABEL: test_mul_by_neg36: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $-36, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: leal (%esi,%esi,8), %ecx +; X86-NEXT: shll $2, %ecx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-HSW-LABEL: test_mul_by_neg36: +; X64-HSW: # %bb.0: +; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: negq %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [7:1.00] +; +; X64-JAG-LABEL: test_mul_by_neg36: +; X64-JAG: # %bb.0: +; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [2:1.00] +; X64-JAG-NEXT: negq %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_neg36: +; X86-NOOPT: # %bb.0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: movl $-36, %edx +; X86-NOOPT-NEXT: movl %ecx, %eax +; X86-NOOPT-NEXT: mull %edx +; X86-NOOPT-NEXT: subl %ecx, %edx +; X86-NOOPT-NEXT: imull $-36, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_neg36: +; HSW-NOOPT: # %bb.0: +; HSW-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [7:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_neg36: +; JAG-NOOPT: # %bb.0: +; JAG-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [6:4.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_neg36: +; X64-SLM: # %bb.0: +; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] +; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] +; X64-SLM-NEXT: negq %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_neg36: +; SLM-NOOPT: # %bb.0: +; SLM-NOOPT-NEXT: imulq $-36, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] + %mul = mul nsw i64 %x, -36 + ret i64 %mul +} + ; (x*9+42)*(x*5+2) define i64 @test_mul_spec(i64 %x) nounwind { ; X86-LABEL: test_mul_spec: diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll index 96ee97341749..6031bafb0972 100644 --- a/test/CodeGen/X86/pku.ll +++ b/test/CodeGen/X86/pku.ll @@ -26,17 +26,11 @@ define void @test_x86_wrpkru(i32 %src) { } define i32 @test_x86_rdpkru() { -; X86-LABEL: test_x86_rdpkru: -; X86: ## %bb.0: -; X86-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9] -; X86-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_rdpkru: -; X64: ## %bb.0: -; X64-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9] -; X64-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_rdpkru: +; CHECK: ## %bb.0: +; CHECK-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9] +; CHECK-NEXT: rdpkru ## encoding: [0x0f,0x01,0xee] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.rdpkru() ret i32 %res } diff --git a/test/CodeGen/X86/pmaddubsw.ll b/test/CodeGen/X86/pmaddubsw.ll new file mode 100644 index 000000000000..d44315af2c6b --- /dev/null +++ b/test/CodeGen/X86/pmaddubsw.ll @@ -0,0 +1,553 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW + +; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through. +; This would require the combine to recreate the concat_vectors. +define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_128: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: pmaddubsw_128: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %A_even_ext = sext <8 x i8> %A_even to <8 x i32> + %B_even_ext = zext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_256: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: pmaddubsw 16(%rdi), %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: pmaddubsw_256: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_256: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 +; AVX256-NEXT: retq + %A = load <32 x i8>, <32 x i8>* %Aptr + %B = load <32 x i8>, <32 x i8>* %Bptr + %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %A_even_ext = sext <16 x i8> %A_even to <16 x i32> + %B_even_ext = zext <16 x i8> %B_even to <16 x i32> + %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32> + %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32> + %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext + %add = add <16 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <16 x i32> %min to <16 x i16> + ret <16 x i16> %trunc +} + +define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_512: +; SSE: # %bb.0: +; SSE-NEXT: movdqa 112(%rdx), %xmm0 +; SSE-NEXT: movdqa 96(%rdx), %xmm1 +; SSE-NEXT: movdqa 80(%rdx), %xmm2 +; SSE-NEXT: movdqa 64(%rdx), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: movdqa 32(%rdx), %xmm6 +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: pmaddubsw (%rsi), %xmm4 +; SSE-NEXT: pmaddubsw 16(%rsi), %xmm5 +; SSE-NEXT: pmaddubsw 32(%rsi), %xmm6 +; SSE-NEXT: pmaddubsw 48(%rsi), %xmm7 +; SSE-NEXT: pmaddubsw 64(%rsi), %xmm3 +; SSE-NEXT: pmaddubsw 80(%rsi), %xmm2 +; SSE-NEXT: pmaddubsw 96(%rsi), %xmm1 +; SSE-NEXT: pmaddubsw 112(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 112(%rdi) +; SSE-NEXT: movdqa %xmm1, 96(%rdi) +; SSE-NEXT: movdqa %xmm2, 80(%rdi) +; SSE-NEXT: movdqa %xmm3, 64(%rdi) +; SSE-NEXT: movdqa %xmm7, 48(%rdi) +; SSE-NEXT: movdqa %xmm6, 32(%rdi) +; SSE-NEXT: movdqa %xmm5, 16(%rdi) +; SSE-NEXT: movdqa %xmm4, (%rdi) +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: pmaddubsw_512: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX1-NEXT: vmovdqa (%rsi), %ymm4 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm9, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: retq +; +; AVX2-LABEL: pmaddubsw_512: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 +; AVX2-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: pmaddubsw_512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: pmaddubsw_512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vpmaddubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaddubsw 64(%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: retq + %A = load <128 x i8>, <128 x i8>* %Aptr + %B = load <128 x i8>, <128 x i8>* %Bptr + %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126> + %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> + %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126> + %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> + %A_even_ext = sext <64 x i8> %A_even to <64 x i32> + %B_even_ext = zext <64 x i8> %B_even to <64 x i32> + %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32> + %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32> + %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext + %add = add <64 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <64 x i32> %min to <64 x i16> + ret <64 x i16> %trunc +} + +define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_swapped_indices: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: pmaddubsw_swapped_indices: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A + %A_even_ext = sext <8 x i8> %A_even to <8 x i32> + %B_even_ext = zext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_swapped_extend: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: pmaddubsw (%rsi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: pmaddubsw_swapped_extend: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0 +; AVX-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %A_even_ext = zext <8 x i8> %A_even to <8 x i32> + %B_even_ext = sext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_commuted_mul: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: pmaddubsw_commuted_mul: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %A_even_ext = sext <8 x i8> %A_even to <8 x i32> + %B_even_ext = zext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_bad_extend: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psllw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pmulhw %xmm2, %xmm4 +; SSE-NEXT: pmullw %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pmulhw %xmm0, %xmm4 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: pmaddubsw_bad_extend: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: pmaddubsw_bad_extend: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddubsw_bad_extend: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %A_even_ext = sext <8 x i8> %A_even to <8 x i32> + %B_even_ext = zext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { +; SSE-LABEL: pmaddubsw_bad_indices: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pmulhw %xmm2, %xmm4 +; SSE-NEXT: pmullw %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pmulhw %xmm0, %xmm4 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: pmaddubsw_bad_indices: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: pmaddubsw_bad_indices: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddubsw_bad_indices: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %A = load <16 x i8>, <16 x i8>* %Aptr + %B = load <16 x i8>, <16 x i8>* %Bptr + %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even + %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd + %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A + %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A + %A_even_ext = sext <8 x i8> %A_even to <8 x i32> + %B_even_ext = zext <8 x i8> %B_even to <8 x i32> + %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> + %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> + %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext + %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext + %add = add <8 x i32> %even_mul, %odd_mul + %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> + %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> + %trunc = trunc <8 x i32> %min to <8 x i16> + ret <8 x i16> %trunc +} diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll index 672baa5c1bdc..5f2cc199bcf4 100644 --- a/test/CodeGen/X86/rem.ll +++ b/test/CodeGen/X86/rem.ll @@ -15,8 +15,8 @@ define i32 @test1(i32 %X) { ; CHECK-NEXT: addl %eax, %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: subl %edx, %eax -; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: subl %eax, %edx +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl %tmp1 = srem i32 %X, 255 @@ -48,8 +48,8 @@ define i32 @test3(i32 %X) { ; CHECK-NEXT: shrl $7, %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: subl %edx, %eax -; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: subl %eax, %edx +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl %tmp1 = urem i32 %X, 255 diff --git a/test/CodeGen/X86/rotate-extract-vector.ll b/test/CodeGen/X86/rotate-extract-vector.ll index 6059a76259ba..e2679dded8b5 100644 --- a/test/CodeGen/X86/rotate-extract-vector.ll +++ b/test/CodeGen/X86/rotate-extract-vector.ll @@ -12,10 +12,10 @@ define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) { ; CHECK-LABEL: vroll_v4i32_extract_shl: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 -; CHECK-NEXT: vpslld $10, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $25, %xmm1, %xmm1 -; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 +; CHECK-NEXT: vprold $7, %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3> %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10> @@ -25,20 +25,12 @@ define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) { } define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind { -; X86-LABEL: vrolq_v4i64_extract_shrl: -; X86: # %bb.0: -; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vprolq $24, %zmm0, %zmm0 -; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 -; X86-NEXT: retl -; -; X64-LABEL: vrolq_v4i64_extract_shrl: -; X64: # %bb.0: -; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vprolq $24, %zmm0, %zmm0 -; X64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919] -; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-NEXT: retq +; CHECK-LABEL: vrolq_v4i64_extract_shrl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0 +; CHECK-NEXT: vprolq $29, %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-NEXT: ret{{[l|q]}} %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40> %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5> %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29> @@ -49,12 +41,10 @@ define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind { define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind { ; CHECK-LABEL: vroll_extract_mul: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640] -; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10] -; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpsrld $26, %ymm0, %ymm0 -; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10] +; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vprold $6, %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: ret{{[l|q]}} %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640> %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> @@ -66,7 +56,7 @@ define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind { define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: vrolq_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $60, %esp +; X86-NEXT: subl $44, %esp ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: vmovss %xmm0, (%esp) @@ -85,53 +75,27 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180 -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180 -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: vpsllq $57, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X86-NEXT: addl $60, %esp +; X86-NEXT: vprolq $57, %zmm0, %zmm0 +; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; X86-NEXT: addl $44, %esp +; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: vrolq_extract_udiv: ; X64: # %bb.0: ; X64-NEXT: vpextrq $1, %xmm0, %rax -; X64-NEXT: movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: vmovq %rax, %xmm1 +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: vmovq %rdx, %xmm1 ; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: vmovq %rax, %xmm0 +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: vmovq %rdx, %xmm0 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: shrq $8, %rcx -; X64-NEXT: vmovq %rcx, %xmm1 -; X64-NEXT: shrq $8, %rdx -; X64-NEXT: vmovq %rdx, %xmm2 -; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; X64-NEXT: vpsllq $57, %xmm0, %xmm0 -; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vprolq $57, %zmm0, %zmm0 +; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; X64-NEXT: vzeroupper ; X64-NEXT: retq %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3> %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384> @@ -141,17 +105,23 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { } define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind { -; CHECK-LABEL: vrolw_extract_mul_with_mask: -; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152] -; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9] -; CHECK-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpsrld $25, %xmm0, %xmm0 -; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: vrolw_extract_mul_with_mask: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] +; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X86-NEXT: vprold $7, %zmm0, %zmm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-NEXT: vzeroupper +; X86-NEXT: retl +; +; X64-LABEL: vrolw_extract_mul_with_mask: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] +; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: vprold $7, %zmm0, %zmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152> %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9> %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160> diff --git a/test/CodeGen/X86/rotate-extract.ll b/test/CodeGen/X86/rotate-extract.ll index 6ce3db13e954..a1babd1d3cc3 100644 --- a/test/CodeGen/X86/rotate-extract.ll +++ b/test/CodeGen/X86/rotate-extract.ll @@ -24,9 +24,7 @@ define i64 @rolq_extract_shl(i64 %i) nounwind { ; X64-LABEL: rolq_extract_shl: ; X64: # %bb.0: ; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: shlq $10, %rdi -; X64-NEXT: shrq $57, %rax -; X64-NEXT: orq %rdi, %rax +; X64-NEXT: rolq $7, %rax ; X64-NEXT: retq %lhs_mul = shl i64 %i, 3 %rhs_mul = shl i64 %i, 10 @@ -39,16 +37,17 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind { ; X86-LABEL: rolw_extract_shrl: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: rolw $9, %ax -; X86-NEXT: andl $61951, %eax # imm = 0xF1FF +; X86-NEXT: shrl $3, %eax +; X86-NEXT: rolw $12, %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: rolw_extract_shrl: ; X64: # %bb.0: -; X64-NEXT: rolw $9, %di -; X64-NEXT: andl $61951, %edi # imm = 0xF1FF -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: rolw $12, %ax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %lhs_div = lshr i16 %i, 7 %rhs_div = lshr i16 %i, 3 @@ -60,22 +59,16 @@ define i16 @rolw_extract_shrl(i16 %i) nounwind { define i32 @roll_extract_mul(i32 %i) nounwind { ; X86-LABEL: roll_extract_mul: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: shll $7, %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %ecx -; X86-NEXT: shrl $25, %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: roll $7, %eax ; X86-NEXT: retl ; ; X64-LABEL: roll_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: shll $7, %edi -; X64-NEXT: leal (%rdi,%rdi,8), %ecx -; X64-NEXT: shrl $25, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: roll $7, %eax ; X64-NEXT: retq %lhs_mul = mul i32 %i, 9 %rhs_mul = mul i32 %i, 1152 @@ -89,11 +82,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull $171, %eax, %eax -; X86-NEXT: movb %ah, %cl -; X86-NEXT: shlb $3, %cl -; X86-NEXT: andb $-16, %cl -; X86-NEXT: shrl $13, %eax -; X86-NEXT: orb %cl, %al +; X86-NEXT: shrl $9, %eax +; X86-NEXT: rolb $4, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -101,12 +91,8 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: imull $171, %eax, %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $8, %ecx -; X64-NEXT: shlb $3, %cl -; X64-NEXT: andb $-16, %cl -; X64-NEXT: shrl $13, %eax -; X64-NEXT: orb %cl, %al +; X64-NEXT: shrl $9, %eax +; X64-NEXT: rolb $4, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 @@ -139,12 +125,8 @@ define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind { ; X64-LABEL: rolq_extract_mul_with_mask: ; X64: # %bb.0: ; X64-NEXT: leaq (%rdi,%rdi,8), %rax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi -; X64-NEXT: shll $7, %edi -; X64-NEXT: leal (%rdi,%rdi,8), %ecx -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: shrq $57, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: rolq $7, %rax +; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq %lhs_mul = mul i64 %i, 1152 %rhs_mul = mul i64 %i, 9 diff --git a/test/CodeGen/X86/signbit-shift.ll b/test/CodeGen/X86/signbit-shift.ll index cee647931bcb..1579a77a2e9b 100644 --- a/test/CodeGen/X86/signbit-shift.ll +++ b/test/CodeGen/X86/signbit-shift.ll @@ -156,9 +156,9 @@ define i32 @sext_ifneg(i32 %x) { define i32 @add_sext_ifneg(i32 %x) { ; CHECK-LABEL: add_sext_ifneg: ; CHECK: # %bb.0: -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: movl $42, %eax -; CHECK-NEXT: subl %edi, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal 42(%rdi), %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 0 %e = sext i1 %c to i32 @@ -169,9 +169,9 @@ define i32 @add_sext_ifneg(i32 %x) { define i32 @sel_ifneg_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_fval_bigger: ; CHECK: # %bb.0: -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: movl $42, %eax -; CHECK-NEXT: subl %edi, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal 42(%rdi), %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 0 %r = select i1 %c, i32 41, i32 42 @@ -231,9 +231,10 @@ define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) { define i32 @sub_lshr(i32 %x, i32 %y) { ; CHECK-LABEL: sub_lshr: ; CHECK: # %bb.0: -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: subl %edi, %esi -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal (%rdi,%rsi), %eax ; CHECK-NEXT: retq %sh = lshr i32 %x, 31 %r = sub i32 %y, %sh @@ -243,9 +244,8 @@ define i32 @sub_lshr(i32 %x, i32 %y) { define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: sub_lshr_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: psrld $31, %xmm0 -; CHECK-NEXT: psubd %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: retq %sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> %r = sub <4 x i32> %y, %sh @@ -255,9 +255,9 @@ define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) { define i32 @sub_const_op_lshr(i32 %x) { ; CHECK-LABEL: sub_const_op_lshr: ; CHECK: # %bb.0: -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: xorl $43, %edi -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal 43(%rdi), %eax ; CHECK-NEXT: retq %sh = lshr i32 %x, 31 %r = sub i32 43, %sh @@ -267,10 +267,8 @@ define i32 @sub_const_op_lshr(i32 %x) { define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) { ; CHECK-LABEL: sub_const_op_lshr_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: psrld $31, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42] -; CHECK-NEXT: psubd %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq %sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> %r = sub <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %sh diff --git a/test/CodeGen/X86/speculative-load-hardening.ll b/test/CodeGen/X86/speculative-load-hardening.ll index 45b9c2f29807..55f7949c0da0 100644 --- a/test/CodeGen/X86/speculative-load-hardening.ll +++ b/test/CodeGen/X86/speculative-load-hardening.ll @@ -8,7 +8,7 @@ declare void @leak(i32 %v1, i32 %v2) declare void @sink(i32) -define i32 @test_trivial_entry_load(i32* %ptr) nounwind { +define i32 @test_trivial_entry_load(i32* %ptr) { ; X64-LABEL: test_trivial_entry_load: ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rsp, %rcx @@ -29,12 +29,18 @@ entry: ret i32 %v } -define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) nounwind { +define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) { ; X64-LABEL: test_basic_conditions: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %r15 +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: .cfi_offset %r14, -24 +; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rsp, %rax ; X64-NEXT: movq $-1, %rbx ; X64-NEXT: sarq $63, %rax @@ -50,10 +56,14 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: popq %r15 +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq ; X64-NEXT: .LBB1_4: # %then2 +; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: movq %r8, %r15 ; X64-NEXT: cmovneq %rbx, %rax ; X64-NEXT: testl %edx, %edx @@ -90,19 +100,21 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr ; ; X64-LFENCE-LABEL: test_basic_conditions: ; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24 +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32 +; X64-LFENCE-NEXT: .cfi_offset %rbx, -24 +; X64-LFENCE-NEXT: .cfi_offset %r14, -16 ; X64-LFENCE-NEXT: testl %edi, %edi ; X64-LFENCE-NEXT: jne .LBB1_6 ; X64-LFENCE-NEXT: # %bb.1: # %then1 ; X64-LFENCE-NEXT: lfence ; X64-LFENCE-NEXT: testl %esi, %esi -; X64-LFENCE-NEXT: je .LBB1_2 -; X64-LFENCE-NEXT: .LBB1_6: # %exit -; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: retq -; X64-LFENCE-NEXT: .LBB1_2: # %then2 -; X64-LFENCE-NEXT: pushq %r14 -; X64-LFENCE-NEXT: pushq %rbx -; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: jne .LBB1_6 +; X64-LFENCE-NEXT: # %bb.2: # %then2 ; X64-LFENCE-NEXT: movq %r8, %rbx ; X64-LFENCE-NEXT: lfence ; X64-LFENCE-NEXT: testl %edx, %edx @@ -126,10 +138,14 @@ define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr ; X64-LFENCE-NEXT: .LBB1_5: # %merge ; X64-LFENCE-NEXT: movslq (%r14), %rax ; X64-LFENCE-NEXT: movl $0, (%rbx,%rax,4) +; X64-LFENCE-NEXT: .LBB1_6: # %exit +; X64-LFENCE-NEXT: lfence ; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24 ; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16 ; X64-LFENCE-NEXT: popq %r14 -; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 8 ; X64-LFENCE-NEXT: retq entry: %a.cmp = icmp eq i32 %a, 0 @@ -465,12 +481,18 @@ declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr -define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { ; X64-LABEL: test_basic_eh: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rbp +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: .cfi_offset %r14, -24 +; X64-NEXT: .cfi_offset %rbp, -16 ; X64-NEXT: movq %rsp, %rax ; X64-NEXT: movq $-1, %rcx ; X64-NEXT: sarq $63, %rax @@ -507,10 +529,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: popq %rbp +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq ; X64-NEXT: .LBB4_4: # %lpad +; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: .Ltmp2: ; X64-NEXT: movq %rsp, %rcx ; X64-NEXT: sarq $63, %rcx @@ -529,8 +555,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality ; X64-LFENCE-LABEL: test_basic_eh: ; X64-LFENCE: # %bb.0: # %entry ; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16 ; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24 ; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32 +; X64-LFENCE-NEXT: .cfi_offset %rbx, -32 +; X64-LFENCE-NEXT: .cfi_offset %r14, -24 +; X64-LFENCE-NEXT: .cfi_offset %rbp, -16 ; X64-LFENCE-NEXT: cmpl $41, %edi ; X64-LFENCE-NEXT: jg .LBB4_2 ; X64-LFENCE-NEXT: # %bb.1: # %thrower @@ -551,10 +583,14 @@ define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality ; X64-LFENCE-NEXT: .LBB4_2: # %exit ; X64-LFENCE-NEXT: lfence ; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 24 ; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 16 ; X64-LFENCE-NEXT: popq %rbp +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 8 ; X64-LFENCE-NEXT: retq ; X64-LFENCE-NEXT: .LBB4_3: # %lpad +; X64-LFENCE-NEXT: .cfi_def_cfa_offset 32 ; X64-LFENCE-NEXT: .Ltmp2: ; X64-LFENCE-NEXT: movl (%rax), %eax ; X64-LFENCE-NEXT: addl (%rbx), %eax diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 2416a177228e..3f251dd8d62c 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -301,9 +301,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: subq %rax, %rdx +; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax @@ -313,9 +313,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: sarq %rdx ; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: subq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: subq %rax, %rdx +; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -331,9 +331,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: sarq %rdx ; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: subq %rax, %rdx +; SSE41-NEXT: addq %rcx, %rdx +; SSE41-NEXT: movq %rdx, %xmm1 ; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: imulq %rsi @@ -342,9 +342,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: sarq %rdx ; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: subq %rax, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: subq %rax, %rdx +; SSE41-NEXT: addq %rcx, %rdx +; SSE41-NEXT: movq %rdx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -359,9 +359,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: imulq %rsi @@ -370,9 +370,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %res = srem <2 x i64> %a, <i64 7, i64 7> diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll index c112e84fbf73..5df4d09e9715 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -263,9 +263,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: imulq %rsi @@ -274,9 +274,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax @@ -286,9 +286,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: imulq %rsi @@ -297,9 +297,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -316,9 +316,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: imulq %rsi @@ -327,9 +327,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rax @@ -339,9 +339,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: imulq %rsi @@ -350,9 +350,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 310e1fc7057a..893c7d1bbd7b 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -214,9 +214,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vmovq %xmm1, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: imulq %rsi @@ -225,9 +225,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx @@ -238,9 +238,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: imulq %rsi @@ -249,9 +249,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -263,9 +263,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: imulq %rsi @@ -274,9 +274,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax @@ -286,9 +286,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: imulq %rsi @@ -297,9 +297,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: subq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %rdx, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index c991a905c054..598782ddd639 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -278,9 +278,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: shrq $2, %rax ; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax @@ -291,9 +291,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: shrq $2, %rax ; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -310,9 +310,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: addq %rdx, %rax ; SSE41-NEXT: shrq $2, %rax ; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: subq %rdx, %rax +; SSE41-NEXT: addq %rcx, %rax +; SSE41-NEXT: movq %rax, %xmm1 ; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rsi @@ -322,9 +322,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE41-NEXT: addq %rdx, %rax ; SSE41-NEXT: shrq $2, %rax ; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: subq %rdx, %rax +; SSE41-NEXT: addq %rcx, %rax +; SSE41-NEXT: movq %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -340,9 +340,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi @@ -352,9 +352,9 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %res = urem <2 x i64> %a, <i64 7, i64 7> diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll index 81d93984e261..377ff5ea77af 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -264,9 +264,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: addq %rdx, %rax ; AVX1-NEXT: shrq $2, %rax ; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: mulq %rsi @@ -276,9 +276,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: addq %rdx, %rax ; AVX1-NEXT: shrq $2, %rax ; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax @@ -289,9 +289,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: addq %rdx, %rax ; AVX1-NEXT: shrq $2, %rax ; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: mulq %rsi @@ -301,9 +301,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: addq %rdx, %rax ; AVX1-NEXT: shrq $2, %rax ; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -321,9 +321,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: shrq $2, %rax ; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: mulq %rsi @@ -333,9 +333,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: shrq $2, %rax ; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rax @@ -346,9 +346,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: shrq $2, %rax ; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rcx ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: mulq %rsi @@ -358,9 +358,9 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: shrq $2, %rax ; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll index 1288f5a5d5be..22c359cb7e98 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -218,9 +218,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm2 ; AVX-NEXT: vmovq %xmm1, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi @@ -230,9 +230,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx @@ -244,9 +244,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi @@ -256,9 +256,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -271,9 +271,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi @@ -283,9 +283,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax @@ -296,9 +296,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi @@ -308,9 +308,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: addq %rdx, %rax ; AVX-NEXT: shrq $2, %rax ; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 90a0c6f291b2..b50680ff56ee 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1008,36 +1008,16 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrlw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512> +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: constant_shift_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: retq ; ; XOP-LABEL: constant_shift_v8i16: ; XOP: # %bb.0: @@ -1046,11 +1026,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; AVX512DQ-LABEL: constant_shift_v8i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v8i16: @@ -1064,10 +1041,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; AVX512DQVL-LABEL: constant_shift_v8i16: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v8i16: diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index f0f0bb8a8819..3ca714d7f830 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1025,21 +1025,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: constant_shift_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v16i16: @@ -1102,21 +1092,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; ; X32-AVX1-LABEL: constant_shift_v16i16: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: constant_shift_v16i16: diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll index 8934535d6f52..24f2b2be4308 100644 --- a/test/CodeGen/X86/win_coreclr_chkstk.ll +++ b/test/CodeGen/X86/win_coreclr_chkstk.ll @@ -10,8 +10,6 @@ entry: ; WIN_X64-LABEL:main4k: ; WIN_X64: # %bb.0: ; WIN_X64: movl $4096, %eax -; WIN_X64: movq %rcx, 8(%rsp) -; WIN_X64: movq %rdx, 16(%rsp) ; WIN_X64: xorq %rcx, %rcx ; WIN_X64: movq %rsp, %rdx ; WIN_X64: subq %rax, %rdx @@ -27,8 +25,6 @@ entry: ; WIN_X64: cmpq %rcx, %rdx ; WIN_X64: jne .LBB0_2 ; WIN_X64:.LBB0_3: -; WIN_X64: movq 8(%rsp), %rcx -; WIN_X64: movq 16(%rsp), %rdx ; WIN_X64: subq %rax, %rsp ; WIN_X64: xorl %eax, %eax ; WIN_X64: addq $4096, %rsp @@ -45,7 +41,6 @@ entry: define i32 @main4k_frame() nounwind "no-frame-pointer-elim"="true" { entry: ; WIN_X64-LABEL:main4k_frame: -; WIN_X64: movq %rcx, 16(%rsp) ; WIN_X64: movq %gs:16, %rcx ; LINUX-LABEL:main4k_frame: ; LINUX-NOT: movq %gs:16, %rcx @@ -58,7 +53,6 @@ entry: ; Case with INT args define i32 @main4k_intargs(i32 %x, i32 %y) nounwind { entry: -; WIN_X64: movq %rcx, 8(%rsp) ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx ; LINUX: retq @@ -71,7 +65,6 @@ entry: ; Case with FP regs define i32 @main4k_fpargs(double %x, double %y) nounwind { entry: -; WIN_X64: movq %rcx, 8(%rsp) ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx ; LINUX: retq diff --git a/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir b/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir new file mode 100644 index 000000000000..8da5f895063f --- /dev/null +++ b/test/CodeGen/X86/win_coreclr_chkstk_liveins.mir @@ -0,0 +1,24 @@ +# RUN: llc -verify-machineinstrs %s -run-pass prologepilog -mtriple=x86_64-pc-win32-coreclr -o - | FileCheck %s +... +--- +name: main4k +# CHECK-LABEL: name: main4k + +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 8 +stack: + - { id: 0, size: 4096, alignment: 1, stack-id: 0 } +body: | + bb.0.entry: + $eax = IMPLICIT_DEF + RET 0, killed $eax + + ; CHECK: bb.1.entry: + ; CHECK: liveins: $rdx + ; CHECK: bb.2.entry: + ; CHECK: liveins: $rcx, $rdx + ; CHECK: bb.3.entry: + ; CHECK: liveins: $rax +... |