diff options
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir | 87 | ||||
-rw-r--r-- | test/CodeGen/AArch64/combine-and-like.ll | 13 | ||||
-rwxr-xr-x | test/CodeGen/X86/avx512-schedule.ll | 8 | ||||
-rw-r--r-- | test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir | 2 | ||||
-rw-r--r-- | test/CodeGen/X86/combine-pmuldq.ll | 108 | ||||
-rw-r--r-- | test/CodeGen/X86/fdiv-combine.ll | 35 | ||||
-rw-r--r-- | test/CodeGen/X86/gather-addresses.ll | 162 | ||||
-rw-r--r-- | test/CodeGen/X86/masked_gather_scatter.ll | 8 | ||||
-rw-r--r-- | test/CodeGen/X86/setcc-combine.ll | 24 | ||||
-rw-r--r-- | test/CodeGen/X86/shrink_vmul.ll | 46 | ||||
-rw-r--r-- | test/CodeGen/X86/slow-pmulld.ll | 16 | ||||
-rw-r--r-- | test/CodeGen/X86/sse2-schedule.ll | 32 |
12 files changed, 413 insertions, 128 deletions
diff --git a/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir b/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir index 630b34028162..c9ff2cd0d514 100644 --- a/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir +++ b/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir @@ -1,7 +1,7 @@ -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefix=UNPROFITABLE %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynosm1 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefix=PROFITABLE %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynosm1 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s # name: f1_2s registers: @@ -80,3 +80,82 @@ body: | # PROFITABLE-LABEL: name: f1_2d # PROFITABLE: %5:fpr128 = FNEGv2f64 %2 # PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1 +--- +name: f1_both_fmul_2s +registers: + - { id: 0, class: fpr64 } + - { id: 1, class: fpr64 } + - { id: 2, class: fpr64 } + - { id: 3, class: fpr64 } + - { id: 4, class: fpr64 } + - { id: 5, class: fpr64 } + - { id: 6, class: fpr64 } +body: | + bb.0.entry: + %3:fpr64 = COPY %q3 + %2:fpr64 = COPY %q2 + %1:fpr64 = COPY %q1 + %0:fpr64 = COPY %q0 + %4:fpr64 = FMULv2f32 %0, %1 + %5:fpr64 = FMULv2f32 %2, %3 + %6:fpr64 = FSUBv2f32 killed %4, %5 + %q0 = COPY %6 + RET_ReallyLR implicit %q0 + +... +# ALL-LABEL: name: f1_both_fmul_2s +# ALL: %4:fpr64 = FMULv2f32 %0, %1 +# ALL-NEXT: FMLSv2f32 killed %4, %2, %3 +--- +name: f1_both_fmul_4s +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } + - { id: 5, class: fpr128 } + - { id: 6, class: fpr128 } +body: | + bb.0.entry: + %3:fpr128 = COPY %q3 + %2:fpr128 = COPY %q2 + %1:fpr128 = COPY %q1 + %0:fpr128 = COPY %q0 + %4:fpr128 = FMULv4f32 %0, %1 + %5:fpr128 = FMULv4f32 %2, %3 + %6:fpr128 = FSUBv4f32 killed %4, %5 + %q0 = COPY %6 + RET_ReallyLR implicit %q0 + +... +# ALL-LABEL: name: f1_both_fmul_4s +# ALL: %4:fpr128 = FMULv4f32 %0, %1 +# ALL-NEXT: FMLSv4f32 killed %4, %2, %3 +--- +name: f1_both_fmul_2d +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } + - { id: 5, class: fpr128 } + - { id: 6, class: fpr128 } +body: | + bb.0.entry: + %3:fpr128 = COPY %q3 + %2:fpr128 = COPY %q2 + %1:fpr128 = COPY %q1 + %0:fpr128 = COPY %q0 + %4:fpr128 = FMULv2f64 %0, %1 + %5:fpr128 = FMULv2f64 %2, %3 + %6:fpr128 = FSUBv2f64 killed %4, %5 + %q0 = COPY %6 + RET_ReallyLR implicit %q0 + +... +# ALL-LABEL: name: f1_both_fmul_2d +# ALL: %4:fpr128 = FMULv2f64 %0, %1 +# ALL-NEXT: FMLSv2f64 killed %4, %2, %3 + diff --git a/test/CodeGen/AArch64/combine-and-like.ll b/test/CodeGen/AArch64/combine-and-like.ll new file mode 100644 index 000000000000..15770c2e02ff --- /dev/null +++ b/test/CodeGen/AArch64/combine-and-like.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s + +define i32 @f(i32 %a0) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = lshr i32 %a0, 2147483647 + %2 = add i32 %1, 2147483647 + %3 = and i32 %2, %1 + ret i32 %3 +} diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 306b95f0f3ae..af99b86ca5d1 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -129,7 +129,7 @@ entry: define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { ; GENERIC-LABEL: imulq512: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq512: @@ -143,7 +143,7 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ; GENERIC-LABEL: imulq256: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq256: @@ -157,7 +157,7 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; GENERIC-LABEL: imulq128: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq128: @@ -550,7 +550,7 @@ define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { ; GENERIC-LABEL: vpmulld_test: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [3:1.00] +; GENERIC-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpmulld_test: diff --git a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir index 965014162073..bbefc4f920a1 100644 --- a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir +++ b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir @@ -57,7 +57,7 @@ # return w; # } # -# CHECK: 129: eb 13 jmp 19 <ifElse+0x7E> +# CHECK: 129: eb 13 jmp 19 <ifElse+0x7e> # CHECK: 12e: eb a0 jmp -96 <ifElse+0x10> # CHECK: 132: eb 9c jmp -100 <ifElse+0x10> # CHECK: 137: eb 97 jmp -105 <ifElse+0x10> diff --git a/test/CodeGen/X86/combine-pmuldq.ll b/test/CodeGen/X86/combine-pmuldq.ll index 53ab87a386b3..ebfe0d56358e 100644 --- a/test/CodeGen/X86/combine-pmuldq.ll +++ b/test/CodeGen/X86/combine-pmuldq.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512DQVL ; TODO - shuffle+sext are superfluous define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { @@ -66,13 +69,29 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-NEXT: pmuludq %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_shuffle_zero_pmuludq: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: combine_shuffle_zero_pmuludq: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: combine_shuffle_zero_pmuludq: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7> %2 = shufflevector <4 x i32> %a1, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 7> %3 = bitcast <4 x i32> %1 to <2 x i64> @@ -94,13 +113,29 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) ; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_shuffle_zero_pmuludq_256: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX2-LABEL: combine_shuffle_zero_pmuludq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq %1 = shufflevector <8 x i32> %a0, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> %2 = shufflevector <8 x i32> %a1, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> %3 = bitcast <8 x i32> %1 to <4 x i64> @@ -108,3 +143,46 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) %5 = mul <4 x i64> %3, %4 ret <4 x i64> %5 } + +define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { +; SSE-LABEL: combine_zext_pmuludq_256: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883] +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX2-LABEL: combine_zext_pmuludq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [715827883,715827883,715827883,715827883] +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: combine_zext_pmuludq_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512VL-NEXT: vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: combine_zext_pmuludq_256: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512DQVL-NEXT: vpmuludq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512DQVL-NEXT: retq + %1 = zext <8 x i32> %a to <8 x i64> + %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883> + ret <8 x i64> %2 +} diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll index 912110e75d27..62e86e3ad2cc 100644 --- a/test/CodeGen/X86/fdiv-combine.ll +++ b/test/CodeGen/X86/fdiv-combine.ll @@ -95,6 +95,41 @@ define double @div3_arcp(double %x, double %y, double %z) { ret double %ret } +define float @div_select_constant_fold(i1 zeroext %arg) { +; CHECK-LABEL: div_select_constant_fold: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jne .LBB6_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB6_1: +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: retq + %tmp = select i1 %arg, float 5.000000e+00, float 6.000000e+00 + %B2 = fdiv float %tmp, 1.000000e+00 + ret float %B2 +} + +define float @div_select_constant_fold_zero(i1 zeroext %arg) { +; CHECK-LABEL: div_select_constant_fold_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jne .LBB7_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: jmp .LBB7_3 +; CHECK-NEXT: .LBB7_1: +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: divss %xmm1, %xmm0 +; CHECK-NEXT: retq + %tmp = select i1 %arg, float 5.000000e+00, float 6.000000e+00 + %B2 = fdiv float %tmp, 0.000000e+00 + ret float %B2 +} + define void @PR24141() { ; CHECK-LABEL: PR24141: ; CHECK: callq diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index e09ad3e4e0b8..6431847064f0 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN ; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32 @@ -7,34 +8,59 @@ ; use an efficient mov/shift sequence rather than shuffling each individual ; element out of the index vector. -; CHECK-LABEL: foo: -; LIN: movdqa (%rsi), %xmm0 -; LIN: pand (%rdx), %xmm0 -; LIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; LIN: movq %xmm0, %r[[REG2:.+]] -; LIN: movslq %e[[REG2]], %r[[REG1:.+]] -; LIN: sarq $32, %r[[REG2]] -; LIN: movslq %e[[REG4]], %r[[REG3:.+]] -; LIN: sarq $32, %r[[REG4]] -; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1 -; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1 -; LIN: movq %rdi, %xmm1 -; LIN: movq %r[[REG3]], %xmm0 - -; WIN: movdqa (%rdx), %xmm0 -; WIN: pand (%r8), %xmm0 -; WIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; WIN: movq %xmm0, %r[[REG2:.+]] -; WIN: movslq %e[[REG2]], %r[[REG1:.+]] -; WIN: sarq $32, %r[[REG2]] -; WIN: movslq %e[[REG4]], %r[[REG3:.+]] -; WIN: sarq $32, %r[[REG4]] -; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1 -; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1 -; WIN: movdqa (%r[[REG2]]), %xmm0 -; WIN: movq %r[[REG2]], %xmm1 - define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { +; LIN-LABEL: foo: +; LIN: # %bb.0: +; LIN-NEXT: movdqa (%rsi), %xmm0 +; LIN-NEXT: pand (%rdx), %xmm0 +; LIN-NEXT: pextrq $1, %xmm0, %rax +; LIN-NEXT: movq %xmm0, %rcx +; LIN-NEXT: movslq %ecx, %rdx +; LIN-NEXT: sarq $32, %rcx +; LIN-NEXT: movslq %eax, %rsi +; LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; LIN-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; LIN-NEXT: sarq $32, %rax +; LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; LIN-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; LIN-NEXT: retq +; +; WIN-LABEL: foo: +; WIN: # %bb.0: +; WIN-NEXT: movdqa (%rdx), %xmm0 +; WIN-NEXT: pand (%r8), %xmm0 +; WIN-NEXT: pextrq $1, %xmm0, %rax +; WIN-NEXT: movq %xmm0, %rdx +; WIN-NEXT: movslq %edx, %r8 +; WIN-NEXT: sarq $32, %rdx +; WIN-NEXT: movslq %eax, %r9 +; WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; WIN-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; WIN-NEXT: sarq $32, %rax +; WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; WIN-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; WIN-NEXT: retq +; +; LIN32-LABEL: foo: +; LIN32: # %bb.0: +; LIN32-NEXT: pushl %edi +; LIN32-NEXT: pushl %esi +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; LIN32-NEXT: movdqa (%edx), %xmm0 +; LIN32-NEXT: pand (%ecx), %xmm0 +; LIN32-NEXT: pextrd $1, %xmm0, %ecx +; LIN32-NEXT: pextrd $2, %xmm0, %edx +; LIN32-NEXT: pextrd $3, %xmm0, %esi +; LIN32-NEXT: movd %xmm0, %edi +; LIN32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; LIN32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; LIN32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; LIN32-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; LIN32-NEXT: popl %esi +; LIN32-NEXT: popl %edi +; LIN32-NEXT: retl %a = load <4 x i32>, <4 x i32>* %i %b = load <4 x i32>, <4 x i32>* %h %j = and <4 x i32> %a, %b @@ -60,13 +86,81 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; Check that the sequence previously used above, which bounces the vector off the ; cache works for x86-32. Note that in this case it will not be used for index ; calculation, since indexes are 32-bit, not 64. -; CHECK-LABEL: old: -; LIN32: movaps %xmm0, (%esp) -; LIN32-DAG: {{(mov|and)}}l (%esp), -; LIN32-DAG: {{(mov|and)}}l 4(%esp), -; LIN32-DAG: {{(mov|and)}}l 8(%esp), -; LIN32-DAG: {{(mov|and)}}l 12(%esp), define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind { +; LIN-LABEL: old: +; LIN: # %bb.0: +; LIN-NEXT: movdqa (%rsi), %xmm0 +; LIN-NEXT: pand (%rdx), %xmm0 +; LIN-NEXT: pextrq $1, %xmm0, %rax +; LIN-NEXT: movq %rax, %rdx +; LIN-NEXT: shrq $32, %rdx +; LIN-NEXT: movq %xmm0, %rsi +; LIN-NEXT: movq %rsi, %rdi +; LIN-NEXT: shrq $32, %rdi +; LIN-NEXT: andl %ecx, %esi +; LIN-NEXT: andl %ecx, %eax +; LIN-NEXT: andq %rcx, %rdi +; LIN-NEXT: andq %rcx, %rdx +; LIN-NEXT: movq %rdi, %xmm1 +; LIN-NEXT: movq %rsi, %xmm0 +; LIN-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; LIN-NEXT: movq %rdx, %xmm2 +; LIN-NEXT: movq %rax, %xmm1 +; LIN-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; LIN-NEXT: retq +; +; WIN-LABEL: old: +; WIN: # %bb.0: +; WIN-NEXT: movdqa (%rdx), %xmm0 +; WIN-NEXT: pand (%r8), %xmm0 +; WIN-NEXT: pextrq $1, %xmm0, %r8 +; WIN-NEXT: movq %r8, %rcx +; WIN-NEXT: shrq $32, %rcx +; WIN-NEXT: movq %xmm0, %rax +; WIN-NEXT: movq %rax, %rdx +; WIN-NEXT: shrq $32, %rdx +; WIN-NEXT: andl %r9d, %eax +; WIN-NEXT: andl %r9d, %r8d +; WIN-NEXT: andq %r9, %rdx +; WIN-NEXT: andq %r9, %rcx +; WIN-NEXT: movq %rdx, %xmm1 +; WIN-NEXT: movq %rax, %xmm0 +; WIN-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; WIN-NEXT: movq %rcx, %xmm2 +; WIN-NEXT: movq %r8, %xmm1 +; WIN-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; WIN-NEXT: retq +; +; LIN32-LABEL: old: +; LIN32: # %bb.0: +; LIN32-NEXT: pushl %ebp +; LIN32-NEXT: movl %esp, %ebp +; LIN32-NEXT: pushl %esi +; LIN32-NEXT: andl $-16, %esp +; LIN32-NEXT: subl $32, %esp +; LIN32-NEXT: movl 20(%ebp), %eax +; LIN32-NEXT: movl 16(%ebp), %ecx +; LIN32-NEXT: movl 12(%ebp), %edx +; LIN32-NEXT: movaps (%edx), %xmm0 +; LIN32-NEXT: andps (%ecx), %xmm0 +; LIN32-NEXT: movaps %xmm0, (%esp) +; LIN32-NEXT: movl (%esp), %ecx +; LIN32-NEXT: andl %eax, %ecx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; LIN32-NEXT: andl %eax, %edx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; LIN32-NEXT: andl %eax, %esi +; LIN32-NEXT: andl {{[0-9]+}}(%esp), %eax +; LIN32-NEXT: movd %edx, %xmm1 +; LIN32-NEXT: movd %ecx, %xmm0 +; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; LIN32-NEXT: movd %eax, %xmm2 +; LIN32-NEXT: movd %esi, %xmm1 +; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; LIN32-NEXT: leal -4(%ebp), %esp +; LIN32-NEXT: popl %esi +; LIN32-NEXT: popl %ebp +; LIN32-NEXT: retl %a = load <4 x i32>, <4 x i32>* %i %b = load <4 x i32>, <4 x i32>* %h %j = and <4 x i32> %a, %b @@ -77,7 +171,7 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind %q0 = zext i32 %d0 to i64 %q1 = zext i32 %d1 to i64 %q2 = zext i32 %d2 to i64 - %q3 = zext i32 %d3 to i64 + %q3 = zext i32 %d3 to i64 %r0 = and i64 %q0, %f %r1 = and i64 %q1, %f %r2 = and i64 %q2, %f diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index d318dde34434..d3521ca9f1e3 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -497,7 +497,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1 -; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 @@ -510,7 +510,7 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax -; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm1, %zmm1 +; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 @@ -582,7 +582,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1 -; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 @@ -595,7 +595,7 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax -; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm1, %zmm1 +; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/setcc-combine.ll b/test/CodeGen/X86/setcc-combine.ll index a4a8e67d742c..56cff4ab6f2f 100644 --- a/test/CodeGen/X86/setcc-combine.ll +++ b/test/CodeGen/X86/setcc-combine.ll @@ -183,3 +183,27 @@ define i32 @test_gt_2(<4 x i32> %A, <4 x i32> %B) { ret i32 %t1 } +; (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) +; Don't combine with i1 - out of range constant +define void @test_i1_uge(i1 *%A2) { +; CHECK-LABEL: test_i1_uge: +; CHECK: # %bb.0: +; CHECK-NEXT: movb (%rdi), %al +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: xorb $1, %cl +; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movb %cl, (%rdi,%rax) +; CHECK-NEXT: retq + %L5 = load i1, i1* %A2 + %C3 = icmp ne i1 %L5, true + %C8 = icmp eq i1 %L5, false + %C9 = icmp ugt i1 %C3, %C8 + %G3 = getelementptr i1, i1* %A2, i1 %C9 + store i1 %C3, i1* %G3 + ret void +} + diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index a516c709517d..ced3a40e4a46 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -112,13 +112,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl c, %esi ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -142,13 +143,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_4xi8: @@ -2215,13 +2217,7 @@ define void @PR34947() { ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl (%eax) ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm1 ; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 ; X86-SSE-NEXT: movd %eax, %xmm2 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 @@ -2415,13 +2411,7 @@ define void @PR34947() { ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rax) ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm1 ; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 ; X64-SSE-NEXT: movd %eax, %xmm2 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 diff --git a/test/CodeGen/X86/slow-pmulld.ll b/test/CodeGen/X86/slow-pmulld.ll index 4d73b11349f5..325e6ee4085a 100644 --- a/test/CodeGen/X86/slow-pmulld.ll +++ b/test/CodeGen/X86/slow-pmulld.ll @@ -10,22 +10,14 @@ define <4 x i32> @foo(<4 x i8> %A) { ; CHECK32-LABEL: foo: ; CHECK32: # %bb.0: -; CHECK32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u] -; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; CHECK32-NEXT: movdqa %xmm0, %xmm2 -; CHECK32-NEXT: pmullw %xmm1, %xmm0 -; CHECK32-NEXT: pmulhw %xmm1, %xmm2 -; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; CHECK32-NEXT: retl ; ; CHECK64-LABEL: foo: ; CHECK64: # %bb.0: -; CHECK64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u] -; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> -; CHECK64-NEXT: movdqa %xmm0, %xmm2 -; CHECK64-NEXT: pmullw %xmm1, %xmm0 -; CHECK64-NEXT: pmulhw %xmm1, %xmm2 -; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: foo: diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index ad2edfe0959e..a789b861b7aa 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -5624,16 +5624,8 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; ATOM-LABEL: test_pmaddwd: ; ATOM: # %bb.0: -; ATOM-NEXT: pmaddwd %xmm1, %xmm0 -; ATOM-NEXT: pmaddwd (%rdi), %xmm0 -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] +; ATOM-NEXT: pmaddwd %xmm1, %xmm0 # sched: [5:5.00] +; ATOM-NEXT: pmaddwd (%rdi), %xmm0 # sched: [5:5.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_pmaddwd: @@ -6241,16 +6233,8 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; ; ATOM-LABEL: test_pmuludq: ; ATOM: # %bb.0: -; ATOM-NEXT: pmuludq %xmm1, %xmm0 -; ATOM-NEXT: pmuludq (%rdi), %xmm0 -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] +; ATOM-NEXT: pmuludq %xmm1, %xmm0 # sched: [5:5.00] +; ATOM-NEXT: pmuludq (%rdi), %xmm0 # sched: [5:5.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_pmuludq: @@ -6394,12 +6378,8 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ; ATOM-LABEL: test_psadbw: ; ATOM: # %bb.0: -; ATOM-NEXT: psadbw %xmm1, %xmm0 -; ATOM-NEXT: psadbw (%rdi), %xmm0 -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] -; ATOM-NEXT: nop # sched: [1:0.50] +; ATOM-NEXT: psadbw %xmm1, %xmm0 # sched: [1:0.50] +; ATOM-NEXT: psadbw (%rdi), %xmm0 # sched: [1:1.00] ; ATOM-NEXT: nop # sched: [1:0.50] ; ATOM-NEXT: nop # sched: [1:0.50] ; ATOM-NEXT: nop # sched: [1:0.50] |